diff --git a/comfy/ldm/ideogram4/model.py b/comfy/ldm/ideogram4/model.py index 3b02a243a..b86c65bf0 100644 --- a/comfy/ldm/ideogram4/model.py +++ b/comfy/ldm/ideogram4/model.py @@ -174,7 +174,7 @@ class Ideogram4Transformer(nn.Module): llm = self.llm_cond_proj(llm) * text_mask h[:, :L_text] = h[:, :L_text] + llm - h = h + self.embed_image_indicator((indicator == OUTPUT_IMAGE_INDICATOR).to(torch.long)) + h = h + self.embed_image_indicator((indicator == OUTPUT_IMAGE_INDICATOR).to(torch.long), out_dtype=h.dtype) # Qwen3-VL interleaved MRoPE; position_ids (B, L, 3) -> (3, L) (same across batch). freqs_cis = precompute_freqs_cis( @@ -235,7 +235,7 @@ class Ideogram4Transformer2DModel(Ideogram4Transformer): def _run_conditional(self, x_chunk, context_chunk, attn_mask_chunk, t_chunk, gh, gw, transformer_options): B = x_chunk.shape[0] device = x_chunk.device - img_tokens = self._img_to_tokens(x_chunk).to(self.dtype) + img_tokens = self._img_to_tokens(x_chunk) L_img = img_tokens.shape[1] L_text = context_chunk.shape[1] L = L_text + L_img @@ -268,7 +268,7 @@ class Ideogram4Transformer2DModel(Ideogram4Transformer): def _run_image_only(self, x_chunk, t_chunk, gh, gw, transformer_options): B = x_chunk.shape[0] device = x_chunk.device - img_tokens = self._img_to_tokens(x_chunk).to(self.dtype) + img_tokens = self._img_to_tokens(x_chunk) L_img = img_tokens.shape[1] position_ids = self._image_position_ids(gh, gw, device).unsqueeze(0).expand(B, L_img, 3) diff --git a/comfy/model_management.py b/comfy/model_management.py index dfd58bf1b..8e786c0a5 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -651,8 +651,7 @@ def ensure_pin_budget(size, evict_active=False): to_free = shortfall + PIN_PRESSURE_HYSTERESIS return free_pins(to_free, evict_active=evict_active) >= shortfall -def ensure_pin_registerable(size, evict_active=True): - shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY +def free_registrations(shortfall, evict_active=True): if MAX_PINNED_MEMORY 
<= 0: return False if shortfall <= 0: @@ -674,6 +673,9 @@ def ensure_pin_registerable(size, evict_active=True): return True return shortfall <= REGISTERABLE_PIN_HYSTERESIS +def ensure_pin_registerable(size, evict_active=True): + return free_registrations(TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY, evict_active=evict_active) + class LoadedModel: def __init__(self, model: ModelPatcher): self._set_model(model) diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index ffe12e0dc..cb77c517a 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -89,13 +89,26 @@ def pin_memory(module, subset="weights", size=None): not comfy.model_management.ensure_pin_registerable(registerable_size)): return _steal_pin(module, stack, buckets, size, priority) + extended = False try: - hostbuf.extend(size=size) + hostbuf.extend(size=size, register=False) + extended = True + pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] + pin.untyped_storage()._comfy_hostbuf = hostbuf + if torch.cuda.cudart().cudaHostRegister(pin.data_ptr(), size, 1) != 0: + comfy.model_management.discard_cuda_async_error() + comfy.model_management.free_registrations(size) + if torch.cuda.cudart().cudaHostRegister(pin.data_ptr(), size, 1) != 0: + comfy.model_management.discard_cuda_async_error() + del pin + hostbuf.truncate(offset, do_unregister=False) + return _steal_pin(module, stack, buckets, size, priority) except RuntimeError: + if extended: + hostbuf.truncate(offset, do_unregister=False) return _steal_pin(module, stack, buckets, size, priority) - module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] - module._pin.untyped_storage()._comfy_hostbuf = hostbuf + module._pin = pin stack.append((module, offset)) module._pin_registered = True module._pin_stack_index = len(stack) - 1 diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py index 6592f6b1d..b48713d41 100644 --- a/comfy_api/latest/_ui.py +++ b/comfy_api/latest/_ui.py @@ -285,7 
+285,7 @@ class AudioSaveHelper: results = [] for batch_number, waveform in enumerate(audio["waveform"].cpu()): filename_with_batch_num = filename.replace("%batch_num%", str(batch_number)) - file = f"{filename_with_batch_num}_{counter:05}_.{format}" + file = f"{filename_with_batch_num}_{counter:05}.{format}" output_path = os.path.join(full_output_folder, file) # Use original sample rate initially diff --git a/comfy_api_nodes/apis/bfl.py b/comfy_api_nodes/apis/bfl.py index 2ad651122..4c950da84 100644 --- a/comfy_api_nodes/apis/bfl.py +++ b/comfy_api_nodes/apis/bfl.py @@ -43,6 +43,7 @@ class BFLFluxEraseRequest(BaseModel): "white (255) marks areas to remove, black (0) marks areas to preserve.", ) dilate_pixels: int = Field(10) + seed: int | None = Field(None) output_format: str = Field("png") diff --git a/comfy_api_nodes/apis/bria.py b/comfy_api_nodes/apis/bria.py index e08a519a8..7a98428c3 100644 --- a/comfy_api_nodes/apis/bria.py +++ b/comfy_api_nodes/apis/bria.py @@ -97,3 +97,28 @@ class BriaRemoveVideoBackgroundResult(BaseModel): class BriaRemoveVideoBackgroundResponse(BaseModel): status: str = Field(...) result: BriaRemoveVideoBackgroundResult | None = Field(None) + + +class BriaVideoGreenScreenRequest(BaseModel): + video: str = Field(..., description="Publicly accessible URL of the input video.") + green_shade: str = Field( + default="broadcast_green", + description="Solid chroma-key shade applied behind the foreground " + "(broadcast_green, chroma_green, or blue_screen).", + ) + output_container_and_codec: str = Field(...) + preserve_audio: bool = Field(True) + seed: int = Field(...) + + +class BriaVideoReplaceBackgroundRequest(BaseModel): + video: str = Field(..., description="Publicly accessible URL of the input (foreground) video.") + background_url: str = Field( + ..., + description="Publicly accessible URL of the background image or video to composite behind " + "the foreground. 
Stretched to the foreground frame; match its aspect ratio for " + "undistorted results.", + ) + output_container_and_codec: str = Field(...) + preserve_audio: bool = Field(True) + seed: int = Field(...) diff --git a/comfy_api_nodes/apis/gemini.py b/comfy_api_nodes/apis/gemini.py index 22879fe18..caaba8f36 100644 --- a/comfy_api_nodes/apis/gemini.py +++ b/comfy_api_nodes/apis/gemini.py @@ -108,13 +108,19 @@ class GeminiVideoMetadata(BaseModel): startOffset: GeminiOffset | None = Field(None) +class GeminiThinkingConfig(BaseModel): + includeThoughts: bool | None = Field(None) + thinkingLevel: str = Field(...) + + class GeminiGenerationConfig(BaseModel): - maxOutputTokens: int | None = Field(None, ge=16, le=8192) + maxOutputTokens: int | None = Field(None, ge=16, le=65536) seed: int | None = Field(None) stopSequences: list[str] | None = Field(None) temperature: float | None = Field(None, ge=0.0, le=2.0) topK: int | None = Field(None, ge=1) topP: float | None = Field(None, ge=0.0, le=1.0) + thinkingConfig: GeminiThinkingConfig | None = Field(None) class GeminiImageOutputOptions(BaseModel): @@ -128,11 +134,6 @@ class GeminiImageConfig(BaseModel): imageOutputOptions: GeminiImageOutputOptions = Field(default_factory=GeminiImageOutputOptions) -class GeminiThinkingConfig(BaseModel): - includeThoughts: bool | None = Field(None) - thinkingLevel: str = Field(...) 
- - class GeminiImageGenerationConfig(GeminiGenerationConfig): responseModalities: list[str] | None = Field(None) imageConfig: GeminiImageConfig | None = Field(None) diff --git a/comfy_api_nodes/nodes_bfl.py b/comfy_api_nodes/nodes_bfl.py index 79961ff9d..259c54ef9 100644 --- a/comfy_api_nodes/nodes_bfl.py +++ b/comfy_api_nodes/nodes_bfl.py @@ -534,6 +534,15 @@ class FluxEraseNode(IO.ComfyNode): max=25, tooltip="Expands the mask boundaries to ensure clean coverage of the object's edges.", ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + optional=True, + ), ], outputs=[IO.Image.Output()], hidden=[ @@ -553,6 +562,7 @@ class FluxEraseNode(IO.ComfyNode): image: Input.Image, mask: Input.Image, dilate_pixels: int = 10, + seed: int = 0, ) -> IO.NodeOutput: validate_image_dimensions(image, min_width=256, min_height=256) mask = resize_mask_to_image(mask, image) @@ -565,6 +575,7 @@ class FluxEraseNode(IO.ComfyNode): image=tensor_to_base64_string(image[:, :, :, :3]), # make sure image will have alpha channel removed mask=mask, dilate_pixels=dilate_pixels, + seed=seed, ), ) diff --git a/comfy_api_nodes/nodes_bria.py b/comfy_api_nodes/nodes_bria.py index ce2c9e9be..e138fafa9 100644 --- a/comfy_api_nodes/nodes_bria.py +++ b/comfy_api_nodes/nodes_bria.py @@ -12,6 +12,8 @@ from comfy_api_nodes.apis.bria import ( BriaRemoveVideoBackgroundRequest, BriaRemoveVideoBackgroundResponse, BriaStatusResponse, + BriaVideoGreenScreenRequest, + BriaVideoReplaceBackgroundRequest, InputModerationSettings, ) from comfy_api_nodes.util import ( @@ -319,6 +321,158 @@ class BriaRemoveVideoBackground(IO.ComfyNode): return IO.NodeOutput(await download_url_to_video_output(response.result.video_url)) +class BriaVideoGreenScreen(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="BriaVideoGreenScreen", + display_name="Bria Video Green Screen", + 
category="partner/video/Bria", + description="Replace a video's background with a solid chroma-key screen using Bria.", + inputs=[ + IO.Video.Input("video"), + IO.Combo.Input( + "green_shade", + options=["broadcast_green", "chroma_green", "blue_screen"], + tooltip="Solid chroma-key shade applied behind the foreground: " + "broadcast_green (#00B140), chroma_green (#00FF00), or blue_screen (#0000FF).", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed controls whether the node should re-run; " + "results are non-deterministic regardless of seed.", + ), + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + expr="""{"type":"usd","usd":0.14,"format":{"suffix":"/second"}}""", + ), + ) + + @classmethod + async def execute( + cls, + video: Input.Video, + green_shade: str, + seed: int, + ) -> IO.NodeOutput: + validate_video_duration(video, max_duration=60.0) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/bria/v2/video/edit/green_screen", method="POST"), + data=BriaVideoGreenScreenRequest( + video=await upload_video_to_comfyapi(cls, video), + green_shade=green_shade, + output_container_and_codec="mp4_h264", + seed=seed, + ), + response_model=BriaStatusResponse, + ) + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/bria/v2/status/{response.request_id}"), + status_extractor=lambda r: r.status, + response_model=BriaRemoveVideoBackgroundResponse, + ) + return IO.NodeOutput(await download_url_to_video_output(response.result.video_url)) + + +class BriaVideoReplaceBackground(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="BriaVideoReplaceBackground", + display_name="Bria Video Replace Background", + category="partner/video/Bria", + description="Replace a video's background 
with a supplied image or video using Bria. " + "The output keeps the foreground's resolution and frame rate; a background with a " + "different aspect ratio is stretched to fit, so match it for undistorted results.", + inputs=[ + IO.Video.Input("video", tooltip="Foreground video whose background is replaced."), + IO.Image.Input( + "background_image", + optional=True, + tooltip="Background image to composite behind the foreground. " + "Provide either a background image or a background video, not both.", + ), + IO.Video.Input( + "background_video", + optional=True, + tooltip="Background video to composite behind the foreground. " + "Provide either a background image or a background video, not both.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed controls whether the node should re-run; " + "results are non-deterministic regardless of seed.", + ), + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + expr="""{"type":"usd","usd":0.14,"format":{"suffix":"/second"}}""", + ), + ) + + @classmethod + async def execute( + cls, + video: Input.Video, + seed: int, + background_image: Input.Image | None = None, + background_video: Input.Video | None = None, + ) -> IO.NodeOutput: + if (background_image is None) == (background_video is None): + raise ValueError("Provide either a background image or a background video, not both.") + validate_video_duration(video, max_duration=60.0) + if background_video is not None: + validate_video_duration(background_video, max_duration=60.0) + background_url = await upload_video_to_comfyapi(cls, background_video, wait_label="Uploading background") + else: + background_url = await upload_image_to_comfyapi(cls, background_image, wait_label="Uploading background") + response = await sync_op( + cls, + 
ApiEndpoint(path="/proxy/bria/v2/video/edit/replace_background", method="POST"), + data=BriaVideoReplaceBackgroundRequest( + video=await upload_video_to_comfyapi(cls, video), + background_url=background_url, + output_container_and_codec="mp4_h264", + seed=seed, + ), + response_model=BriaStatusResponse, + ) + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/bria/v2/status/{response.request_id}"), + status_extractor=lambda r: r.status, + response_model=BriaRemoveVideoBackgroundResponse, + ) + return IO.NodeOutput(await download_url_to_video_output(response.result.video_url)) + + def _video_to_images_and_mask(video: Input.Video) -> tuple[Input.Image, Input.Mask]: """Decode a transparent webm (VP9 + alpha) into image frames and an alpha mask. @@ -416,6 +570,8 @@ class BriaExtension(ComfyExtension): BriaImageEditNode, BriaRemoveImageBackground, BriaRemoveVideoBackground, + BriaVideoGreenScreen, + # BriaVideoReplaceBackground, # server returns Status 500 when we pass background video BriaTransparentVideoBackground, ] diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index d8885a7e5..c30ddc446 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -7,6 +7,7 @@ from io import BytesIO import torch from typing_extensions import override +from comfy.utils import common_upscale from comfy_api.latest import IO, ComfyExtension, Input, Types from comfy_api_nodes.apis.bytedance import ( RECOMMENDED_PRESETS, @@ -131,6 +132,44 @@ def _prepare_seedance_image(image: Input.Image) -> Input.Image: return image +# Supported output aspect ratios, used to pre-size FLF frames to matching pixel pair to avoid the 1080p stretch jump. 
+SEEDANCE2_RATIO_WH = { + "16:9": (16, 9), + "4:3": (4, 3), + "1:1": (1, 1), + "3:4": (3, 4), + "9:16": (9, 16), + "21:9": (21, 9), +} +SEEDANCE2_RES_SHORT_SIDE = {"480p": 480, "720p": 720, "1080p": 1080} + + +def _seedance2_target_dims(resolution: str, ratio: str, image: torch.Tensor) -> tuple[int, int]: + """Exact supported output (width, height) for (resolution, ratio). + + The shorter side equals the resolution number (e.g. 1080p 16:9 -> 1920x1080). For ratio + "adaptive" (or any unexpected value) the ratio is derived from the image's own aspect, snapped + to the nearest supported ratio, so the output keeps the frame's orientation. + """ + short = SEEDANCE2_RES_SHORT_SIDE[resolution] + if ratio not in SEEDANCE2_RATIO_WH: + aspect = image.shape[-2] / image.shape[-3] # W / H; tensor is (B, H, W, C) + ratio = min(SEEDANCE2_RATIO_WH, key=lambda k: abs(SEEDANCE2_RATIO_WH[k][0] / SEEDANCE2_RATIO_WH[k][1] - aspect)) + rw, rh = SEEDANCE2_RATIO_WH[ratio] + if rw >= rh: # landscape or square: shorter side is the height + out_w, out_h = round(short * rw / rh), short + else: # portrait: shorter side is the width + out_w, out_h = short, round(short * rh / rw) + return out_w - out_w % 2, out_h - out_h % 2 + + +def _resize_to_exact(image: torch.Tensor, width: int, height: int) -> torch.Tensor: + """Center-crop to the target aspect and resize to exactly width x height (lanczos).""" + samples = image.movedim(-1, 1) # (B, H, W, C) -> (B, C, H, W) + resized = common_upscale(samples, width, height, "lanczos", "center") + return resized.movedim(1, -1) + + async def _resolve_reference_assets( cls: type[IO.ComfyNode], asset_ids: list[str], @@ -1790,10 +1829,28 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode): if last_frame is not None and last_frame_asset_id: raise ValueError("Provide only one of last_frame or last_frame_asset_id, not both.") - if first_frame is not None: - first_frame = _prepare_seedance_image(first_frame) - if last_frame is not None: - last_frame = 
_prepare_seedance_image(last_frame) + request_ratio = model["ratio"] + if first_frame_asset_id or last_frame_asset_id: + if first_frame is not None: + first_frame = _prepare_seedance_image(first_frame) + if last_frame is not None: + last_frame = _prepare_seedance_image(last_frame) + else: + # The 1080p FLF stretch fix (pre-size frames to a supported pixel pair + submit ratio="adaptive") + # only applies to local image inputs we can resize. + request_ratio = "adaptive" + target_dims: tuple[int, int] | None = None + if first_frame is not None: + validate_image_aspect_ratio(first_frame, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 + validate_image_dimensions(first_frame, min_width=300, min_height=300) + target_dims = _seedance2_target_dims(model["resolution"], model["ratio"], first_frame) + first_frame = _resize_to_exact(first_frame, *target_dims) + if last_frame is not None: + validate_image_aspect_ratio(last_frame, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 + validate_image_dimensions(last_frame, min_width=300, min_height=300) + if target_dims is None: + target_dims = _seedance2_target_dims(model["resolution"], model["ratio"], last_frame) + last_frame = _resize_to_exact(last_frame, *target_dims) asset_ids_to_resolve = [a for a in (first_frame_asset_id, last_frame_asset_id) if a] image_assets: dict[str, str] = {} @@ -1844,7 +1901,7 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode): content=content, generate_audio=model["generate_audio"], resolution=model["resolution"], - ratio=model["ratio"], + ratio=request_ratio, duration=model["duration"], seed=seed, watermark=watermark, diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index e75ef3835..2699d2792 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -8,7 +8,7 @@ import os from enum import Enum from fnmatch import fnmatch from io import BytesIO -from typing import Literal +from typing import Any, Literal import torch from typing_extensions import override 
@@ -19,6 +19,7 @@ from comfy_api_nodes.apis.gemini import ( GeminiContent, GeminiFileData, GeminiGenerateContentRequest, + GeminiGenerationConfig, GeminiGenerateContentResponse, GeminiImageConfig, GeminiImageGenerateContentRequest, @@ -40,13 +41,18 @@ from comfy_api_nodes.util import ( get_number_of_images, sync_op, tensor_to_base64_string, + upload_audio_to_comfyapi, + upload_image_to_comfyapi, upload_images_to_comfyapi, + upload_video_to_comfyapi, validate_string, video_to_base64_string, ) GEMINI_BASE_ENDPOINT = "/proxy/vertexai/gemini" GEMINI_MAX_INPUT_FILE_SIZE = 20 * 1024 * 1024 # 20 MB +GEMINI_URL_INPUT_BUDGET = 10 +GEMINI_MAX_INLINE_BYTES = 18 * 1024 * 1024 GEMINI_IMAGE_SYS_PROMPT = ( "You are an expert image-generation engine. You must ALWAYS produce an image.\n" "Interpret all user input—regardless of " @@ -285,6 +291,140 @@ def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | N return final_price / 1_000_000.0 +def create_video_parts(video_input: Input.Video) -> list[GeminiPart]: + """Convert a single video input to Gemini API compatible parts (inline MP4/H.264).""" + base_64_string = video_to_base64_string( + video_input, container_format=Types.VideoContainer.MP4, codec=Types.VideoCodec.H264 + ) + return [ + GeminiPart( + inlineData=GeminiInlineData( + mimeType=GeminiMimeType.video_mp4, + data=base_64_string, + ) + ) + ] + + +def create_audio_parts(audio_input: Input.Audio) -> list[GeminiPart]: + """Convert an audio input to Gemini API compatible parts (one inline MP3 part per batch item).""" + audio_parts: list[GeminiPart] = [] + for batch_index in range(audio_input["waveform"].shape[0]): + # Recreate an IO.AUDIO object for the given batch dimension index + audio_at_index = Input.Audio( + waveform=audio_input["waveform"][batch_index].unsqueeze(0), + sample_rate=audio_input["sample_rate"], + ) + # Convert to MP3 format for compatibility with Gemini API + audio_bytes = audio_to_base64_string( + audio_at_index, + 
container_format="mp3", + codec_name="libmp3lame", + ) + audio_parts.append( + GeminiPart( + inlineData=GeminiInlineData( + mimeType=GeminiMimeType.audio_mp3, + data=audio_bytes, + ) + ) + ) + return audio_parts + + +def _flatten_images(images: list[Input.Image]) -> list[torch.Tensor]: + """Expand any batched image tensors into individual (H, W, C) frames, preserving order.""" + frames: list[torch.Tensor] = [] + for img in images: + if len(img.shape) == 4: + frames.extend(img[i] for i in range(img.shape[0])) + else: + frames.append(img) + return frames + + +def _flatten_audio(audios: list[Input.Audio]) -> list[Input.Audio]: + """Expand any batched audio inputs into individual single-clip audio inputs, preserving order.""" + clips: list[Input.Audio] = [] + for audio in audios: + waveform = audio["waveform"] + for i in range(waveform.shape[0]): + clips.append(Input.Audio(waveform=waveform[i].unsqueeze(0), sample_rate=audio["sample_rate"])) + return clips + + +async def _media_url_part(cls: type[IO.ComfyNode], kind: str, payload: Any) -> GeminiPart: + """Upload a single media unit to ComfyAPI storage and return a fileData (URL) part.""" + if kind == "image": + url = await upload_image_to_comfyapi(cls, payload, mime_type="image/png", wait_label="Uploading image") + return GeminiPart(fileData=GeminiFileData(mimeType=GeminiMimeType.image_png, fileUri=url)) + if kind == "audio": + url = await upload_audio_to_comfyapi( + cls, payload, container_format="mp3", codec_name="libmp3lame", mime_type="audio/mp3" + ) + return GeminiPart(fileData=GeminiFileData(mimeType=GeminiMimeType.audio_mp3, fileUri=url)) + url = await upload_video_to_comfyapi(cls, payload, wait_label="Uploading video") + return GeminiPart(fileData=GeminiFileData(mimeType=GeminiMimeType.video_mp4, fileUri=url)) + + +def _media_inline_part(kind: str, payload: Any) -> tuple[GeminiPart, int]: + """Encode a single media unit as an inline base64 part; returns (part, base64_length).""" + if kind == "image": + data = 
tensor_to_base64_string(payload, mime_type="image/webp") + mime = GeminiMimeType.image_webp + elif kind == "audio": + data = audio_to_base64_string(payload, container_format="mp3", codec_name="libmp3lame") + mime = GeminiMimeType.audio_mp3 + else: + data = video_to_base64_string( + payload, container_format=Types.VideoContainer.MP4, codec=Types.VideoCodec.H264 + ) + mime = GeminiMimeType.video_mp4 + return GeminiPart(inlineData=GeminiInlineData(mimeType=mime, data=data)), len(data) + + +async def build_gemini_media_parts( + cls: type[IO.ComfyNode], + images: list[Input.Image], + audios: list[Input.Audio], + videos: list[Input.Video], + *, + url_budget: int = GEMINI_URL_INPUT_BUDGET, + max_inline_bytes: int = GEMINI_MAX_INLINE_BYTES, +) -> list[GeminiPart]: + """Build Gemini parts for multimodal inputs (images, audio, video). + + fileData URLs are preferred for every media type: the upload is fetched directly by the + model, keeping the request body tiny regardless of media size. The URL budget is shared + across all media and assigned largest-first (video, then audio, then images), so that if it + is ever exhausted the inline-base64 overflow is limited to the smallest items. Total inline + payload is capped by `max_inline_bytes`. + """ + units: list[tuple[str, Any]] = ( + [("video", v) for v in videos] + + [("audio", a) for a in _flatten_audio(audios)] + + [("image", f) for f in _flatten_images(images)] + ) + + parts: list[GeminiPart] = [] + url_used = 0 + inline_bytes = 0 + for kind, payload in units: + if url_used < url_budget: + parts.append(await _media_url_part(cls, kind, payload)) + url_used += 1 + continue + part, nbytes = _media_inline_part(kind, payload) + inline_bytes += nbytes + if inline_bytes > max_inline_bytes: + raise ValueError( + f"Too much media to send inline (over {max_inline_bytes // (1024 * 1024)}MB after the first " + f"{url_budget} inputs are uploaded as URLs). Reduce the number or size of attached media." 
+ ) + parts.append(part) + return parts + + class GeminiNode(IO.ComfyNode): """ Node to generate text responses from a Gemini model. @@ -407,58 +547,9 @@ class GeminiNode(IO.ComfyNode): ) """, ), + is_deprecated=True, ) - @classmethod - def create_video_parts(cls, video_input: Input.Video) -> list[GeminiPart]: - """Convert video input to Gemini API compatible parts.""" - - base_64_string = video_to_base64_string( - video_input, container_format=Types.VideoContainer.MP4, codec=Types.VideoCodec.H264 - ) - return [ - GeminiPart( - inlineData=GeminiInlineData( - mimeType=GeminiMimeType.video_mp4, - data=base_64_string, - ) - ) - ] - - @classmethod - def create_audio_parts(cls, audio_input: Input.Audio) -> list[GeminiPart]: - """ - Convert audio input to Gemini API compatible parts. - - Args: - audio_input: Audio input from ComfyUI, containing waveform tensor and sample rate. - - Returns: - List of GeminiPart objects containing the encoded audio. - """ - audio_parts: list[GeminiPart] = [] - for batch_index in range(audio_input["waveform"].shape[0]): - # Recreate an IO.AUDIO object for the given batch dimension index - audio_at_index = Input.Audio( - waveform=audio_input["waveform"][batch_index].unsqueeze(0), - sample_rate=audio_input["sample_rate"], - ) - # Convert to MP3 format for compatibility with Gemini API - audio_bytes = audio_to_base64_string( - audio_at_index, - container_format="mp3", - codec_name="libmp3lame", - ) - audio_parts.append( - GeminiPart( - inlineData=GeminiInlineData( - mimeType=GeminiMimeType.audio_mp3, - data=audio_bytes, - ) - ) - ) - return audio_parts - @classmethod async def execute( cls, @@ -482,9 +573,9 @@ class GeminiNode(IO.ComfyNode): if images is not None: parts.extend(await create_image_parts(cls, images)) if audio is not None: - parts.extend(cls.create_audio_parts(audio)) + parts.extend(create_audio_parts(audio)) if video is not None: - parts.extend(cls.create_video_parts(video)) + parts.extend(create_video_parts(video)) if files is 
not None: parts.extend(files) @@ -512,6 +603,210 @@ class GeminiNode(IO.ComfyNode): return IO.NodeOutput(output_text or "Empty response from Gemini model...") +GEMINI_V2_MODELS: dict[str, str] = { + "Gemini 3.1 Pro": "gemini-3.1-pro-preview", + "Gemini 3.1 Flash-Lite": "gemini-3.1-flash-lite-preview", +} + + +def _gemini_text_model_inputs(thinking_default: str) -> list[Input]: + """Per-model inputs revealed by the model DynamicCombo (shared media + sampling controls).""" + return [ + IO.Autogrow.Input( + "images", + template=IO.Autogrow.TemplateNames( + IO.Image.Input("image"), + names=[f"image_{i}" for i in range(1, 17)], + min=0, + ), + tooltip="Optional image(s) to use as context for the model. Up to 16 images.", + ), + IO.Autogrow.Input( + "audio", + template=IO.Autogrow.TemplateNames( + IO.Audio.Input("audio"), + names=["audio_1"], + min=0, + ), + tooltip="Optional audio clip to use as context for the model.", + ), + IO.Autogrow.Input( + "video", + template=IO.Autogrow.TemplateNames( + IO.Video.Input("video"), + names=["video_1"], + min=0, + ), + tooltip="Optional video clip to use as context for the model.", + ), + IO.Custom("GEMINI_INPUT_FILES").Input( + "files", + optional=True, + tooltip="Optional file(s) to use as context for the model. " + "Accepts inputs from the Gemini Input Files node.", + ), + IO.Combo.Input( + "thinking_level", + options=["LOW", "HIGH"], + default=thinking_default, + tooltip="How hard the model reasons internally before answering. " + "HIGH improves quality on difficult tasks but costs more (thinking) tokens and is slower.", + ), + IO.Float.Input( + "temperature", + default=1.0, + min=0.0, + max=2.0, + step=0.01, + tooltip="Controls randomness. 
Lower is more focused/deterministic, higher is more creative.", + advanced=True, + ), + IO.Float.Input( + "top_p", + default=0.95, + min=0.0, + max=1.0, + step=0.01, + tooltip="Nucleus sampling: sample from the smallest token set whose cumulative probability reaches top_p.", + advanced=True, + ), + IO.Int.Input( + "max_output_tokens", + default=32768, + min=16, + max=65536, + tooltip="Maximum tokens to generate, including the model's internal thinking. " + "With thinking_level HIGH, a low value can leave no room for the answer; raise this if " + "responses come back empty or truncated. The model stops early when finished, so a higher " + "cap costs nothing extra for short replies.", + advanced=True, + ), + ] + + +class GeminiNodeV2(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="GeminiNodeV2", + display_name="Google Gemini", + category="partner/text/Gemini", + essentials_category="Text Generation", + description="Generate text responses with Google's Gemini models. Provide a text prompt and, " + "optionally, one or more images, audio clips, videos, or files as multimodal context.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text input to the model. Include detailed instructions, questions, or context.", + ), + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option("Gemini 3.1 Pro", _gemini_text_model_inputs("HIGH")), + IO.DynamicCombo.Option("Gemini 3.1 Flash-Lite", _gemini_text_model_inputs("LOW")), + ], + tooltip="The Gemini model used to generate the response.", + ), + IO.Int.Input( + "seed", + default=42, + min=0, + max=2147483647, + control_after_generate=True, + tooltip="Seed for sampling. Set to 0 for a random seed. 
@classmethod
async def execute(
    cls,
    prompt: str,
    model: dict,
    seed: int,
    system_prompt: str = "",
) -> IO.NodeOutput:
    """Send a single-turn text request to the selected Gemini model.

    Args:
        prompt: User text; checked with ``validate_string`` before any
            request is built.
        model: Values of the dynamic "model" input. Reads the keys
            ``model``, ``temperature``, ``top_p``, ``max_output_tokens``
            and ``thinking_level``, plus the optional ``images``,
            ``audio``, ``video`` and ``files`` entries.
        seed: Forwarded only when greater than zero; otherwise the request
            carries ``None`` for the seed.
        system_prompt: Optional system instruction; an empty string means
            no system instruction is sent.

    Returns:
        A node output holding the response text, or a fixed placeholder
        message when no text could be extracted from the response.
    """
    validate_string(prompt, strip_whitespace=True, min_length=1)
    model_id = GEMINI_V2_MODELS[model["model"]]

    def _present(key: str) -> list:
        # A media group may be missing or None; keep only the entries that
        # actually hold a value.
        return [item for item in (model.get(key) or {}).values() if item is not None]

    parts: list[GeminiPart] = [GeminiPart(text=prompt)]
    images = _present("images")
    audios = _present("audio")
    videos = _present("video")
    if images or audios or videos:
        media_parts = await build_gemini_media_parts(cls, images, audios, videos)
        parts.extend(media_parts)

    files = model.get("files")
    if files is not None:
        parts.extend(files)

    if system_prompt:
        gemini_system_prompt = GeminiSystemInstructionContent(
            parts=[GeminiTextPart(text=system_prompt)],
            role=None,
        )
    else:
        gemini_system_prompt = None

    endpoint = ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model_id}", method="POST")
    user_turn = GeminiContent(role=GeminiRole.user, parts=parts)
    generation_config = GeminiGenerationConfig(
        temperature=model["temperature"],
        topP=model["top_p"],
        maxOutputTokens=model["max_output_tokens"],
        # Non-positive seeds are not sent to the API.
        seed=seed if seed > 0 else None,
        thinkingConfig=GeminiThinkingConfig(thinkingLevel=model["thinking_level"]),
    )
    request = GeminiGenerateContentRequest(
        contents=[user_turn],
        generationConfig=generation_config,
        systemInstruction=gemini_system_prompt,
    )

    response = await sync_op(
        cls,
        endpoint=endpoint,
        data=request,
        response_model=GeminiGenerateContentResponse,
        price_extractor=calculate_tokens_price,
    )

    output_text = get_text_from_response(response)
    if output_text:
        return IO.NodeOutput(output_text)
    return IO.NodeOutput("Empty response from Gemini model...")
IO.DynamicCombo.Option(_MODEL_LARGE, _krea_model_inputs()), ], tooltip="Krea 2 Medium is best for expressive illustrations; " @@ -151,14 +154,15 @@ class Krea2ImageNode(IO.ComfyNode): ), expr=""" ( - $isLarge := widgets.model = "krea 2 large"; + $rates := { + "krea 2 medium turbo": {"text": 0.015, "style": 0.0175, "moodboard": 0.02}, + "krea 2 medium": {"text": 0.03, "style": 0.035, "moodboard": 0.04}, + "krea 2 large": {"text": 0.06, "style": 0.065, "moodboard": 0.07} + }; + $r := $lookup($rates, widgets.model); $hasMoodboard := $length($lookup(widgets, "model.moodboard_id")) > 0; $hasStyle := $lookup(inputs, "model.style_reference").connected; - $usd := $hasMoodboard - ? ($isLarge ? 0.07 : 0.04) - : ($hasStyle - ? ($isLarge ? 0.065 : 0.035) - : ($isLarge ? 0.06 : 0.03)); + $usd := $hasMoodboard ? $r.moodboard : ($hasStyle ? $r.style : $r.text); {"type":"usd","usd": $usd} ) """, diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index ff078f74c..1dc97ecd7 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -158,7 +158,7 @@ class SaveAudio(IO.ComfyNode): return IO.Schema( node_id="SaveAudio", search_aliases=["export flac"], - display_name="Save Audio (FLAC)", + display_name="Save Audio (FLAC) (Deprecated)", category="audio", essentials_category="Audio", inputs=[ @@ -167,6 +167,7 @@ class SaveAudio(IO.ComfyNode): ], hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo], is_output_node=True, + is_deprecated=True, ) @classmethod @@ -186,7 +187,7 @@ class SaveAudioMP3(IO.ComfyNode): return IO.Schema( node_id="SaveAudioMP3", search_aliases=["export mp3"], - display_name="Save Audio (MP3)", + display_name="Save Audio (MP3) (Deprecated)", category="audio", essentials_category="Audio", inputs=[ @@ -196,6 +197,7 @@ class SaveAudioMP3(IO.ComfyNode): ], hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo], is_output_node=True, + is_deprecated=True, ) @classmethod @@ -217,7 +219,7 @@ class SaveAudioOpus(IO.ComfyNode): return IO.Schema( 
class SaveAudioAdvanced(IO.ComfyNode):
    """Output node that saves audio as FLAC, MP3 or Opus.

    The container is picked through a single dynamic "format" input; the
    MP3 and Opus options each expose their own "quality" choice, while
    FLAC has no extra settings.
    """

    @classmethod
    def define_schema(cls):
        """Build the node schema: audio input, filename prefix and format selector."""
        audio_input = IO.Audio.Input("audio", tooltip="The audio to save.")
        prefix_input = IO.String.Input(
            "filename_prefix",
            default="audio/ComfyUI",
            tooltip=(
                "The prefix for the file to save. May include formatting tokens "
                "such as %date:yyyy-MM-dd%."
            ),
        )

        # Per-format nested widgets shown when that format is selected.
        mp3_quality = IO.Combo.Input("quality", options=["V0", "128k", "320k"], default="V0")
        opus_quality = IO.Combo.Input(
            "quality",
            options=["64k", "96k", "128k", "192k", "320k"],
            default="128k",
        )
        format_input = IO.DynamicCombo.Input(
            "format",
            options=[
                IO.DynamicCombo.Option("flac", []),
                IO.DynamicCombo.Option("mp3", [mp3_quality]),
                IO.DynamicCombo.Option("opus", [opus_quality]),
            ],
            tooltip="The file format in which to save the audio.",
        )

        return IO.Schema(
            node_id="SaveAudioAdvanced",
            search_aliases=["save audio", "export audio", "output audio", "write audio", "flac", "mp3", "opus"],
            display_name="Save Audio (Advanced)",
            description="Saves the input audio to your ComfyUI output directory.",
            category="audio",
            inputs=[audio_input, prefix_input, format_input],
            hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],
            is_output_node=True,
        )

    @classmethod
    def execute(cls, audio, filename_prefix: str, format: dict) -> IO.NodeOutput:
        """Save ``audio`` in the selected format and return the resulting UI payload.

        Args:
            audio: The audio to save.
            filename_prefix: Prefix handed to the save helper.
            format: Values of the dynamic "format" input; read for the
                ``format`` key and the optional ``quality`` key.

        Returns:
            A node output whose ``ui`` is the value returned by
            ``UI.AudioSaveHelper.get_save_audio_ui``.
        """
        save_kwargs = {
            "filename_prefix": filename_prefix,
            "cls": cls,
            "format": format.get("format", None),
        }
        quality = format.get("quality", None)
        if quality:
            # Only forward a quality when the selected format supplied one;
            # otherwise the argument is omitted from the call entirely.
            save_kwargs["quality"] = quality
        return IO.NodeOutput(ui=UI.AudioSaveHelper.get_save_audio_ui(audio, **save_kwargs))
only + if self.uncond_inner is None or (math.isclose(self.cfg, 1.0) and not model_options.get("disable_cfg1_optimization", False)): + return cond uncond_model_options = model_options if "multigpu_clones" in model_options: # TODO: support multigpu instead of just running uncond on a single GPU @@ -1140,7 +1141,7 @@ class CFGOverride(io.ComfyNode): return io.Schema( node_id="CFGOverride", display_name="CFG Override", - description="Override cfg to a fixed value over a [start, end] percent slice of the steps. " + description="Override cfg to a fixed value over a [start, end] percent (sigma) range. " "With multiple overrides, the one nearest the sampler wins on overlap.", category="sampling/custom_sampling", inputs=[ diff --git a/requirements.txt b/requirements.txt index 79d38fc06..613553d8f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -comfyui-frontend-package==1.44.19 -comfyui-workflow-templates==0.9.94 +comfyui-frontend-package==1.45.15 +comfyui-workflow-templates==0.9.98 comfyui-embedded-docs==0.5.2 torch torchsde @@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0 filelock av>=16.0.0 comfy-kitchen==0.2.10 -comfy-aimdo==0.4.8 +comfy-aimdo==0.4.9 requests simpleeval>=1.0.0 blake3