Merge branch 'master' into fix/amd_rocm_qwen35_reference_image_segfault_fix

2026-07-21 23:41:28 +08:00 · 2026-06-05 18:51:44 +00:00
parent 2be2a9dad3 4a00126e9c
commit bd00da2f68
15 changed files with 708 additions and 89 deletions
@@ -174,7 +174,7 @@ class Ideogram4Transformer(nn.Module):
            llm = self.llm_cond_proj(llm) * text_mask
            h[:, :L_text] = h[:, :L_text] + llm

-        h = h + self.embed_image_indicator((indicator == OUTPUT_IMAGE_INDICATOR).to(torch.long))
+        h = h + self.embed_image_indicator((indicator == OUTPUT_IMAGE_INDICATOR).to(torch.long), out_dtype=h.dtype)

        # Qwen3-VL interleaved MRoPE; position_ids (B, L, 3) -> (3, L) (same across batch).
        freqs_cis = precompute_freqs_cis(
@@ -235,7 +235,7 @@ class Ideogram4Transformer2DModel(Ideogram4Transformer):
    def _run_conditional(self, x_chunk, context_chunk, attn_mask_chunk, t_chunk, gh, gw, transformer_options):
        B = x_chunk.shape[0]
        device = x_chunk.device
-        img_tokens = self._img_to_tokens(x_chunk).to(self.dtype)
+        img_tokens = self._img_to_tokens(x_chunk)
        L_img = img_tokens.shape[1]
        L_text = context_chunk.shape[1]
        L = L_text + L_img
@@ -268,7 +268,7 @@ class Ideogram4Transformer2DModel(Ideogram4Transformer):
    def _run_image_only(self, x_chunk, t_chunk, gh, gw, transformer_options):
        B = x_chunk.shape[0]
        device = x_chunk.device
-        img_tokens = self._img_to_tokens(x_chunk).to(self.dtype)
+        img_tokens = self._img_to_tokens(x_chunk)
        L_img = img_tokens.shape[1]

        position_ids = self._image_position_ids(gh, gw, device).unsqueeze(0).expand(B, L_img, 3)
@@ -651,8 +651,7 @@ def ensure_pin_budget(size, evict_active=False):
    to_free = shortfall + PIN_PRESSURE_HYSTERESIS
    return free_pins(to_free, evict_active=evict_active) >= shortfall

-def ensure_pin_registerable(size, evict_active=True):
-    shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY
+def free_registrations(shortfall, evict_active=True):
    if MAX_PINNED_MEMORY <= 0:
        return False
    if shortfall <= 0:
@@ -674,6 +673,9 @@ def ensure_pin_registerable(size, evict_active=True):
                    return True
    return shortfall <= REGISTERABLE_PIN_HYSTERESIS

+def ensure_pin_registerable(size, evict_active=True):
+    return free_registrations(TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY, evict_active=evict_active)
+
 class LoadedModel:
    def __init__(self, model: ModelPatcher):
        self._set_model(model)
@@ -89,13 +89,26 @@ def pin_memory(module, subset="weights", size=None):
        not comfy.model_management.ensure_pin_registerable(registerable_size)):
        return _steal_pin(module, stack, buckets, size, priority)

+    extended = False
    try:
-        hostbuf.extend(size=size)
+        hostbuf.extend(size=size, register=False)
+        extended = True
+        pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size]
+        pin.untyped_storage()._comfy_hostbuf = hostbuf
+        if torch.cuda.cudart().cudaHostRegister(pin.data_ptr(), size, 1) != 0:
+            comfy.model_management.discard_cuda_async_error()
+            comfy.model_management.free_registrations(size)
+            if torch.cuda.cudart().cudaHostRegister(pin.data_ptr(), size, 1) != 0:
+                comfy.model_management.discard_cuda_async_error()
+                del pin
+                hostbuf.truncate(offset, do_unregister=False)
+                return _steal_pin(module, stack, buckets, size, priority)
    except RuntimeError:
+        if extended:
+            hostbuf.truncate(offset, do_unregister=False)
        return _steal_pin(module, stack, buckets, size, priority)

-    module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size]
-    module._pin.untyped_storage()._comfy_hostbuf = hostbuf
+    module._pin = pin
    stack.append((module, offset))
    module._pin_registered = True
    module._pin_stack_index = len(stack) - 1
@@ -285,7 +285,7 @@ class AudioSaveHelper:
        results = []
        for batch_number, waveform in enumerate(audio["waveform"].cpu()):
            filename_with_batch_num = filename.replace("%batch_num%", str(batch_number))
-            file = f"{filename_with_batch_num}_{counter:05}_.{format}"
+            file = f"{filename_with_batch_num}_{counter:05}.{format}"
            output_path = os.path.join(full_output_folder, file)

            # Use original sample rate initially
@@ -43,6 +43,7 @@ class BFLFluxEraseRequest(BaseModel):
        "white (255) marks areas to remove, black (0) marks areas to preserve.",
    )
    dilate_pixels: int = Field(10)
+    seed: int | None = Field(None)
    output_format: str = Field("png")


@@ -97,3 +97,28 @@ class BriaRemoveVideoBackgroundResult(BaseModel):
 class BriaRemoveVideoBackgroundResponse(BaseModel):
    status: str = Field(...)
    result: BriaRemoveVideoBackgroundResult | None = Field(None)
+
+
+class BriaVideoGreenScreenRequest(BaseModel):
+    video: str = Field(..., description="Publicly accessible URL of the input video.")
+    green_shade: str = Field(
+        default="broadcast_green",
+        description="Solid chroma-key shade applied behind the foreground "
+        "(broadcast_green, chroma_green, or blue_screen).",
+    )
+    output_container_and_codec: str = Field(...)
+    preserve_audio: bool = Field(True)
+    seed: int = Field(...)
+
+
+class BriaVideoReplaceBackgroundRequest(BaseModel):
+    video: str = Field(..., description="Publicly accessible URL of the input (foreground) video.")
+    background_url: str = Field(
+        ...,
+        description="Publicly accessible URL of the background image or video to composite behind "
+        "the foreground. Stretched to the foreground frame; match its aspect ratio for "
+        "undistorted results.",
+    )
+    output_container_and_codec: str = Field(...)
+    preserve_audio: bool = Field(True)
+    seed: int = Field(...)
@@ -108,13 +108,19 @@ class GeminiVideoMetadata(BaseModel):
    startOffset: GeminiOffset | None = Field(None)


+class GeminiThinkingConfig(BaseModel):
+    includeThoughts: bool | None = Field(None)
+    thinkingLevel: str = Field(...)
+
+
 class GeminiGenerationConfig(BaseModel):
-    maxOutputTokens: int | None = Field(None, ge=16, le=8192)
+    maxOutputTokens: int | None = Field(None, ge=16, le=65536)
    seed: int | None = Field(None)
    stopSequences: list[str] | None = Field(None)
    temperature: float | None = Field(None, ge=0.0, le=2.0)
    topK: int | None = Field(None, ge=1)
    topP: float | None = Field(None, ge=0.0, le=1.0)
+    thinkingConfig: GeminiThinkingConfig | None = Field(None)


 class GeminiImageOutputOptions(BaseModel):
@@ -128,11 +134,6 @@ class GeminiImageConfig(BaseModel):
    imageOutputOptions: GeminiImageOutputOptions = Field(default_factory=GeminiImageOutputOptions)


-class GeminiThinkingConfig(BaseModel):
-    includeThoughts: bool | None = Field(None)
-    thinkingLevel: str = Field(...)
-
-
 class GeminiImageGenerationConfig(GeminiGenerationConfig):
    responseModalities: list[str] | None = Field(None)
    imageConfig: GeminiImageConfig | None = Field(None)
@@ -534,6 +534,15 @@ class FluxEraseNode(IO.ComfyNode):
                    max=25,
                    tooltip="Expands the mask boundaries to ensure clean coverage of the object's edges.",
                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    control_after_generate=True,
+                    tooltip="The random seed used for creating the noise.",
+                    optional=True,
+                ),
            ],
            outputs=[IO.Image.Output()],
            hidden=[
@@ -553,6 +562,7 @@ class FluxEraseNode(IO.ComfyNode):
        image: Input.Image,
        mask: Input.Image,
        dilate_pixels: int = 10,
+        seed: int = 0,
    ) -> IO.NodeOutput:
        validate_image_dimensions(image, min_width=256, min_height=256)
        mask = resize_mask_to_image(mask, image)
@@ -565,6 +575,7 @@ class FluxEraseNode(IO.ComfyNode):
                image=tensor_to_base64_string(image[:, :, :, :3]),  # make sure image will have alpha channel removed
                mask=mask,
                dilate_pixels=dilate_pixels,
+                seed=seed,
            ),
        )

@@ -12,6 +12,8 @@ from comfy_api_nodes.apis.bria import (
    BriaRemoveVideoBackgroundRequest,
    BriaRemoveVideoBackgroundResponse,
    BriaStatusResponse,
+    BriaVideoGreenScreenRequest,
+    BriaVideoReplaceBackgroundRequest,
    InputModerationSettings,
 )
 from comfy_api_nodes.util import (
@@ -319,6 +321,158 @@ class BriaRemoveVideoBackground(IO.ComfyNode):
        return IO.NodeOutput(await download_url_to_video_output(response.result.video_url))


+class BriaVideoGreenScreen(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="BriaVideoGreenScreen",
+            display_name="Bria Video Green Screen",
+            category="partner/video/Bria",
+            description="Replace a video's background with a solid chroma-key screen using Bria.",
+            inputs=[
+                IO.Video.Input("video"),
+                IO.Combo.Input(
+                    "green_shade",
+                    options=["broadcast_green", "chroma_green", "blue_screen"],
+                    tooltip="Solid chroma-key shade applied behind the foreground: "
+                    "broadcast_green (#00B140), chroma_green (#00FF00), or blue_screen (#0000FF).",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed controls whether the node should re-run; "
+                    "results are non-deterministic regardless of seed.",
+                ),
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                expr="""{"type":"usd","usd":0.14,"format":{"suffix":"/second"}}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        video: Input.Video,
+        green_shade: str,
+        seed: int,
+    ) -> IO.NodeOutput:
+        validate_video_duration(video, max_duration=60.0)
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/bria/v2/video/edit/green_screen", method="POST"),
+            data=BriaVideoGreenScreenRequest(
+                video=await upload_video_to_comfyapi(cls, video),
+                green_shade=green_shade,
+                output_container_and_codec="mp4_h264",
+                seed=seed,
+            ),
+            response_model=BriaStatusResponse,
+        )
+        response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/bria/v2/status/{response.request_id}"),
+            status_extractor=lambda r: r.status,
+            response_model=BriaRemoveVideoBackgroundResponse,
+        )
+        return IO.NodeOutput(await download_url_to_video_output(response.result.video_url))
+
+
+class BriaVideoReplaceBackground(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="BriaVideoReplaceBackground",
+            display_name="Bria Video Replace Background",
+            category="partner/video/Bria",
+            description="Replace a video's background with a supplied image or video using Bria. "
+            "The output keeps the foreground's resolution and frame rate; a background with a "
+            "different aspect ratio is stretched to fit, so match it for undistorted results.",
+            inputs=[
+                IO.Video.Input("video", tooltip="Foreground video whose background is replaced."),
+                IO.Image.Input(
+                    "background_image",
+                    optional=True,
+                    tooltip="Background image to composite behind the foreground. "
+                    "Provide either a background image or a background video, not both.",
+                ),
+                IO.Video.Input(
+                    "background_video",
+                    optional=True,
+                    tooltip="Background video to composite behind the foreground. "
+                    "Provide either a background image or a background video, not both.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed controls whether the node should re-run; "
+                    "results are non-deterministic regardless of seed.",
+                ),
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                expr="""{"type":"usd","usd":0.14,"format":{"suffix":"/second"}}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        video: Input.Video,
+        seed: int,
+        background_image: Input.Image | None = None,
+        background_video: Input.Video | None = None,
+    ) -> IO.NodeOutput:
+        if (background_image is None) == (background_video is None):
+            raise ValueError("Provide either a background image or a background video, not both.")
+        validate_video_duration(video, max_duration=60.0)
+        if background_video is not None:
+            validate_video_duration(background_video, max_duration=60.0)
+            background_url = await upload_video_to_comfyapi(cls, background_video, wait_label="Uploading background")
+        else:
+            background_url = await upload_image_to_comfyapi(cls, background_image, wait_label="Uploading background")
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/bria/v2/video/edit/replace_background", method="POST"),
+            data=BriaVideoReplaceBackgroundRequest(
+                video=await upload_video_to_comfyapi(cls, video),
+                background_url=background_url,
+                output_container_and_codec="mp4_h264",
+                seed=seed,
+            ),
+            response_model=BriaStatusResponse,
+        )
+        response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/bria/v2/status/{response.request_id}"),
+            status_extractor=lambda r: r.status,
+            response_model=BriaRemoveVideoBackgroundResponse,
+        )
+        return IO.NodeOutput(await download_url_to_video_output(response.result.video_url))
+
+
 def _video_to_images_and_mask(video: Input.Video) -> tuple[Input.Image, Input.Mask]:
    """Decode a transparent webm (VP9 + alpha) into image frames and an alpha mask.

@@ -416,6 +570,8 @@ class BriaExtension(ComfyExtension):
            BriaImageEditNode,
            BriaRemoveImageBackground,
            BriaRemoveVideoBackground,
+            BriaVideoGreenScreen,
+            # BriaVideoReplaceBackground,  # server returns Status 500 when we pass background video
            BriaTransparentVideoBackground,
        ]

@@ -7,6 +7,7 @@ from io import BytesIO
 import torch
 from typing_extensions import override

+from comfy.utils import common_upscale
 from comfy_api.latest import IO, ComfyExtension, Input, Types
 from comfy_api_nodes.apis.bytedance import (
    RECOMMENDED_PRESETS,
@@ -131,6 +132,44 @@ def _prepare_seedance_image(image: Input.Image) -> Input.Image:
    return image


+# Supported output aspect ratios, used to pre-size FLF frames to a matching pixel pair to avoid the 1080p stretch jump.
+SEEDANCE2_RATIO_WH = {
+    "16:9": (16, 9),
+    "4:3": (4, 3),
+    "1:1": (1, 1),
+    "3:4": (3, 4),
+    "9:16": (9, 16),
+    "21:9": (21, 9),
+}
+SEEDANCE2_RES_SHORT_SIDE = {"480p": 480, "720p": 720, "1080p": 1080}
+
+
+def _seedance2_target_dims(resolution: str, ratio: str, image: torch.Tensor) -> tuple[int, int]:
+    """Exact supported output (width, height) for (resolution, ratio).
+
+    The shorter side equals the resolution number (e.g. 1080p 16:9 -> 1920x1080). For ratio
+    "adaptive" (or any unexpected value) the ratio is derived from the image's own aspect, snapped
+    to the nearest supported ratio, so the output keeps the frame's orientation.
+    """
+    short = SEEDANCE2_RES_SHORT_SIDE[resolution]
+    if ratio not in SEEDANCE2_RATIO_WH:
+        aspect = image.shape[-2] / image.shape[-3]  # W / H; tensor is (B, H, W, C)
+        ratio = min(SEEDANCE2_RATIO_WH, key=lambda k: abs(SEEDANCE2_RATIO_WH[k][0] / SEEDANCE2_RATIO_WH[k][1] - aspect))
+    rw, rh = SEEDANCE2_RATIO_WH[ratio]
+    if rw >= rh:  # landscape or square: shorter side is the height
+        out_w, out_h = round(short * rw / rh), short
+    else:  # portrait: shorter side is the width
+        out_w, out_h = short, round(short * rh / rw)
+    return out_w - out_w % 2, out_h - out_h % 2
+
+
+def _resize_to_exact(image: torch.Tensor, width: int, height: int) -> torch.Tensor:
+    """Center-crop to the target aspect and resize to exactly width x height (lanczos)."""
+    samples = image.movedim(-1, 1)  # (B, H, W, C) -> (B, C, H, W)
+    resized = common_upscale(samples, width, height, "lanczos", "center")
+    return resized.movedim(1, -1)
+
+
 async def _resolve_reference_assets(
    cls: type[IO.ComfyNode],
    asset_ids: list[str],
@@ -1790,10 +1829,28 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
        if last_frame is not None and last_frame_asset_id:
            raise ValueError("Provide only one of last_frame or last_frame_asset_id, not both.")

-        if first_frame is not None:
-            first_frame = _prepare_seedance_image(first_frame)
-        if last_frame is not None:
-            last_frame = _prepare_seedance_image(last_frame)
+        request_ratio = model["ratio"]
+        if first_frame_asset_id or last_frame_asset_id:
+            if first_frame is not None:
+                first_frame = _prepare_seedance_image(first_frame)
+            if last_frame is not None:
+                last_frame = _prepare_seedance_image(last_frame)
+        else:
+            # The 1080p FLF stretch fix (pre-size frames to a supported pixel pair + submit ratio="adaptive")
+            # only applies to local image inputs we can resize.
+            request_ratio = "adaptive"
+            target_dims: tuple[int, int] | None = None
+            if first_frame is not None:
+                validate_image_aspect_ratio(first_frame, (2, 5), (5, 2), strict=False)  # 0.4 to 2.5
+                validate_image_dimensions(first_frame, min_width=300, min_height=300)
+                target_dims = _seedance2_target_dims(model["resolution"], model["ratio"], first_frame)
+                first_frame = _resize_to_exact(first_frame, *target_dims)
+            if last_frame is not None:
+                validate_image_aspect_ratio(last_frame, (2, 5), (5, 2), strict=False)  # 0.4 to 2.5
+                validate_image_dimensions(last_frame, min_width=300, min_height=300)
+                if target_dims is None:
+                    target_dims = _seedance2_target_dims(model["resolution"], model["ratio"], last_frame)
+                last_frame = _resize_to_exact(last_frame, *target_dims)

        asset_ids_to_resolve = [a for a in (first_frame_asset_id, last_frame_asset_id) if a]
        image_assets: dict[str, str] = {}
@@ -1844,7 +1901,7 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
                content=content,
                generate_audio=model["generate_audio"],
                resolution=model["resolution"],
-                ratio=model["ratio"],
+                ratio=request_ratio,
                duration=model["duration"],
                seed=seed,
                watermark=watermark,
@@ -8,7 +8,7 @@ import os
 from enum import Enum
 from fnmatch import fnmatch
 from io import BytesIO
-from typing import Literal
+from typing import Any, Literal

 import torch
 from typing_extensions import override
@@ -19,6 +19,7 @@ from comfy_api_nodes.apis.gemini import (
    GeminiContent,
    GeminiFileData,
    GeminiGenerateContentRequest,
+    GeminiGenerationConfig,
    GeminiGenerateContentResponse,
    GeminiImageConfig,
    GeminiImageGenerateContentRequest,
@@ -40,13 +41,18 @@ from comfy_api_nodes.util import (
    get_number_of_images,
    sync_op,
    tensor_to_base64_string,
+    upload_audio_to_comfyapi,
+    upload_image_to_comfyapi,
    upload_images_to_comfyapi,
+    upload_video_to_comfyapi,
    validate_string,
    video_to_base64_string,
 )

 GEMINI_BASE_ENDPOINT = "/proxy/vertexai/gemini"
 GEMINI_MAX_INPUT_FILE_SIZE = 20 * 1024 * 1024  # 20 MB
+GEMINI_URL_INPUT_BUDGET = 10
+GEMINI_MAX_INLINE_BYTES = 18 * 1024 * 1024
 GEMINI_IMAGE_SYS_PROMPT = (
    "You are an expert image-generation engine. You must ALWAYS produce an image.\n"
    "Interpret all user input—regardless of "
@@ -285,6 +291,140 @@ def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | N
    return final_price / 1_000_000.0


+def create_video_parts(video_input: Input.Video) -> list[GeminiPart]:
+    """Convert a single video input to Gemini API compatible parts (inline MP4/H.264)."""
+    base_64_string = video_to_base64_string(
+        video_input, container_format=Types.VideoContainer.MP4, codec=Types.VideoCodec.H264
+    )
+    return [
+        GeminiPart(
+            inlineData=GeminiInlineData(
+                mimeType=GeminiMimeType.video_mp4,
+                data=base_64_string,
+            )
+        )
+    ]
+
+
+def create_audio_parts(audio_input: Input.Audio) -> list[GeminiPart]:
+    """Convert an audio input to Gemini API compatible parts (one inline MP3 part per batch item)."""
+    audio_parts: list[GeminiPart] = []
+    for batch_index in range(audio_input["waveform"].shape[0]):
+        # Recreate an IO.AUDIO object for the given batch dimension index
+        audio_at_index = Input.Audio(
+            waveform=audio_input["waveform"][batch_index].unsqueeze(0),
+            sample_rate=audio_input["sample_rate"],
+        )
+        # Convert to MP3 format for compatibility with Gemini API
+        audio_bytes = audio_to_base64_string(
+            audio_at_index,
+            container_format="mp3",
+            codec_name="libmp3lame",
+        )
+        audio_parts.append(
+            GeminiPart(
+                inlineData=GeminiInlineData(
+                    mimeType=GeminiMimeType.audio_mp3,
+                    data=audio_bytes,
+                )
+            )
+        )
+    return audio_parts
+
+
+def _flatten_images(images: list[Input.Image]) -> list[torch.Tensor]:
+    """Expand any batched image tensors into individual (H, W, C) frames, preserving order."""
+    frames: list[torch.Tensor] = []
+    for img in images:
+        if len(img.shape) == 4:
+            frames.extend(img[i] for i in range(img.shape[0]))
+        else:
+            frames.append(img)
+    return frames
+
+
+def _flatten_audio(audios: list[Input.Audio]) -> list[Input.Audio]:
+    """Expand any batched audio inputs into individual single-clip audio inputs, preserving order."""
+    clips: list[Input.Audio] = []
+    for audio in audios:
+        waveform = audio["waveform"]
+        for i in range(waveform.shape[0]):
+            clips.append(Input.Audio(waveform=waveform[i].unsqueeze(0), sample_rate=audio["sample_rate"]))
+    return clips
+
+
+async def _media_url_part(cls: type[IO.ComfyNode], kind: str, payload: Any) -> GeminiPart:
+    """Upload a single media unit to ComfyAPI storage and return a fileData (URL) part."""
+    if kind == "image":
+        url = await upload_image_to_comfyapi(cls, payload, mime_type="image/png", wait_label="Uploading image")
+        return GeminiPart(fileData=GeminiFileData(mimeType=GeminiMimeType.image_png, fileUri=url))
+    if kind == "audio":
+        url = await upload_audio_to_comfyapi(
+            cls, payload, container_format="mp3", codec_name="libmp3lame", mime_type="audio/mp3"
+        )
+        return GeminiPart(fileData=GeminiFileData(mimeType=GeminiMimeType.audio_mp3, fileUri=url))
+    url = await upload_video_to_comfyapi(cls, payload, wait_label="Uploading video")
+    return GeminiPart(fileData=GeminiFileData(mimeType=GeminiMimeType.video_mp4, fileUri=url))
+
+
+def _media_inline_part(kind: str, payload: Any) -> tuple[GeminiPart, int]:
+    """Encode a single media unit as an inline base64 part; returns (part, base64_length)."""
+    if kind == "image":
+        data = tensor_to_base64_string(payload, mime_type="image/webp")
+        mime = GeminiMimeType.image_webp
+    elif kind == "audio":
+        data = audio_to_base64_string(payload, container_format="mp3", codec_name="libmp3lame")
+        mime = GeminiMimeType.audio_mp3
+    else:
+        data = video_to_base64_string(
+            payload, container_format=Types.VideoContainer.MP4, codec=Types.VideoCodec.H264
+        )
+        mime = GeminiMimeType.video_mp4
+    return GeminiPart(inlineData=GeminiInlineData(mimeType=mime, data=data)), len(data)
+
+
+async def build_gemini_media_parts(
+    cls: type[IO.ComfyNode],
+    images: list[Input.Image],
+    audios: list[Input.Audio],
+    videos: list[Input.Video],
+    *,
+    url_budget: int = GEMINI_URL_INPUT_BUDGET,
+    max_inline_bytes: int = GEMINI_MAX_INLINE_BYTES,
+) -> list[GeminiPart]:
+    """Build Gemini parts for multimodal inputs (images, audio, video).
+
+    fileData URLs are preferred for every media type: the upload is fetched directly by the
+    model, keeping the request body tiny regardless of media size. The URL budget is shared
+    across all media and assigned largest-first (video, then audio, then images), so that if it
+    is ever exhausted the inline-base64 overflow is limited to the smallest items. Total inline
+    payload is capped by `max_inline_bytes`.
+    """
+    units: list[tuple[str, Any]] = (
+        [("video", v) for v in videos]
+        + [("audio", a) for a in _flatten_audio(audios)]
+        + [("image", f) for f in _flatten_images(images)]
+    )
+
+    parts: list[GeminiPart] = []
+    url_used = 0
+    inline_bytes = 0
+    for kind, payload in units:
+        if url_used < url_budget:
+            parts.append(await _media_url_part(cls, kind, payload))
+            url_used += 1
+            continue
+        part, nbytes = _media_inline_part(kind, payload)
+        inline_bytes += nbytes
+        if inline_bytes > max_inline_bytes:
+            raise ValueError(
+                f"Too much media to send inline (over {max_inline_bytes // (1024 * 1024)}MB after the first "
+                f"{url_budget} inputs are uploaded as URLs). Reduce the number or size of attached media."
+            )
+        parts.append(part)
+    return parts
+
+
 class GeminiNode(IO.ComfyNode):
    """
    Node to generate text responses from a Gemini model.
@@ -407,58 +547,9 @@ class GeminiNode(IO.ComfyNode):
                )
                """,
            ),
+            is_deprecated=True,
        )

-    @classmethod
-    def create_video_parts(cls, video_input: Input.Video) -> list[GeminiPart]:
-        """Convert video input to Gemini API compatible parts."""
-
-        base_64_string = video_to_base64_string(
-            video_input, container_format=Types.VideoContainer.MP4, codec=Types.VideoCodec.H264
-        )
-        return [
-            GeminiPart(
-                inlineData=GeminiInlineData(
-                    mimeType=GeminiMimeType.video_mp4,
-                    data=base_64_string,
-                )
-            )
-        ]
-
-    @classmethod
-    def create_audio_parts(cls, audio_input: Input.Audio) -> list[GeminiPart]:
-        """
-        Convert audio input to Gemini API compatible parts.
-
-        Args:
-            audio_input: Audio input from ComfyUI, containing waveform tensor and sample rate.
-
-        Returns:
-            List of GeminiPart objects containing the encoded audio.
-        """
-        audio_parts: list[GeminiPart] = []
-        for batch_index in range(audio_input["waveform"].shape[0]):
-            # Recreate an IO.AUDIO object for the given batch dimension index
-            audio_at_index = Input.Audio(
-                waveform=audio_input["waveform"][batch_index].unsqueeze(0),
-                sample_rate=audio_input["sample_rate"],
-            )
-            # Convert to MP3 format for compatibility with Gemini API
-            audio_bytes = audio_to_base64_string(
-                audio_at_index,
-                container_format="mp3",
-                codec_name="libmp3lame",
-            )
-            audio_parts.append(
-                GeminiPart(
-                    inlineData=GeminiInlineData(
-                        mimeType=GeminiMimeType.audio_mp3,
-                        data=audio_bytes,
-                    )
-                )
-            )
-        return audio_parts
-
    @classmethod
    async def execute(
        cls,
@@ -482,9 +573,9 @@ class GeminiNode(IO.ComfyNode):
        if images is not None:
            parts.extend(await create_image_parts(cls, images))
        if audio is not None:
-            parts.extend(cls.create_audio_parts(audio))
+            parts.extend(create_audio_parts(audio))
        if video is not None:
-            parts.extend(cls.create_video_parts(video))
+            parts.extend(create_video_parts(video))
        if files is not None:
            parts.extend(files)

@@ -512,6 +603,210 @@ class GeminiNode(IO.ComfyNode):
        return IO.NodeOutput(output_text or "Empty response from Gemini model...")


+GEMINI_V2_MODELS: dict[str, str] = {  # "model" option name in GeminiNodeV2 -> model id appended to the endpoint path
+    "Gemini 3.1 Pro": "gemini-3.1-pro-preview",
+    "Gemini 3.1 Flash-Lite": "gemini-3.1-flash-lite-preview",
+}
+
+
+def _gemini_text_model_inputs(thinking_default: str) -> list[Input]:
+    """Build the nested inputs revealed under one option of the "model" DynamicCombo.
+
+    Every option shares the same media slots and sampling controls; ``thinking_default``
+    is the only per-model difference and becomes the default of the "thinking_level" combo.
+    """
+
+    def media_group(group_id: str, slot, slot_names: list[str], tooltip: str):
+        # One Autogrow group of media slots (min=0; the tooltips describe these as optional).
+        template = IO.Autogrow.TemplateNames(slot, names=slot_names, min=0)
+        return IO.Autogrow.Input(group_id, template=template, tooltip=tooltip)
+
+    images = media_group(
+        "images",
+        IO.Image.Input("image"),
+        [f"image_{i}" for i in range(1, 17)],
+        "Optional image(s) to use as context for the model. Up to 16 images.",
+    )
+    audio = media_group(
+        "audio",
+        IO.Audio.Input("audio"),
+        ["audio_1"],
+        "Optional audio clip to use as context for the model.",
+    )
+    video = media_group(
+        "video",
+        IO.Video.Input("video"),
+        ["video_1"],
+        "Optional video clip to use as context for the model.",
+    )
+    files = IO.Custom("GEMINI_INPUT_FILES").Input(
+        "files",
+        optional=True,
+        tooltip="Optional file(s) to use as context for the model. "
+        "Accepts inputs from the Gemini Input Files node.",
+    )
+    thinking_level = IO.Combo.Input(
+        "thinking_level",
+        options=["LOW", "HIGH"],
+        default=thinking_default,
+        tooltip="How hard the model reasons internally before answering. "
+        "HIGH improves quality on difficult tasks but costs more (thinking) tokens and is slower.",
+    )
+    temperature = IO.Float.Input(
+        "temperature",
+        default=1.0,
+        min=0.0,
+        max=2.0,
+        step=0.01,
+        tooltip="Controls randomness. Lower is more focused/deterministic, higher is more creative.",
+        advanced=True,
+    )
+    top_p = IO.Float.Input(
+        "top_p",
+        default=0.95,
+        min=0.0,
+        max=1.0,
+        step=0.01,
+        tooltip="Nucleus sampling: sample from the smallest token set whose cumulative probability reaches top_p.",
+        advanced=True,
+    )
+    max_output_tokens = IO.Int.Input(
+        "max_output_tokens",
+        default=32768,
+        min=16,
+        max=65536,
+        tooltip="Maximum tokens to generate, including the model's internal thinking. "
+        "With thinking_level HIGH, a low value can leave no room for the answer; raise this if "
+        "responses come back empty or truncated. The model stops early when finished, so a higher "
+        "cap costs nothing extra for short replies.",
+        advanced=True,
+    )
+    return [images, audio, video, files, thinking_level, temperature, top_p, max_output_tokens]
+
+
+class GeminiNodeV2(IO.ComfyNode):
+    """API node that generates text with a Gemini 3.1 model from a prompt and optional media or files."""
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="GeminiNodeV2",
+            display_name="Google Gemini",
+            category="partner/text/Gemini",
+            essentials_category="Text Generation",
+            description="Generate text responses with Google's Gemini models. Provide a text prompt and, "
+            "optionally, one or more images, audio clips, videos, or files as multimodal context.",
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Text input to the model. Include detailed instructions, questions, or context.",
+                ),
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=[
+                        IO.DynamicCombo.Option("Gemini 3.1 Pro", _gemini_text_model_inputs("HIGH")),
+                        IO.DynamicCombo.Option("Gemini 3.1 Flash-Lite", _gemini_text_model_inputs("LOW")),
+                    ],
+                    tooltip="The Gemini model used to generate the response.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=42,
+                    min=0,
+                    max=2147483647,
+                    control_after_generate=True,
+                    tooltip="Seed for sampling. Set to 0 for a random seed. Deterministic output isn't guaranteed.",
+                ),
+                IO.String.Input(
+                    "system_prompt",
+                    multiline=True,
+                    default="",
+                    optional=True,
+                    advanced=True,
+                    tooltip="Foundational instructions that dictate the model's behavior.",
+                ),
+            ],
+            outputs=[IO.String.Output()],
+            hidden=[IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, IO.Hidden.unique_id],
+            is_api_node=True,
+            # NOTE(review): the badge tests for lowercase "lite" although the option is named
+            # "Gemini 3.1 Flash-Lite"; presumably widget values reach the expression lowercased - confirm.
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["model"]),
+                expr="""
+                (
+                  $m := widgets.model;
+                  $contains($m, "lite") ? {
+                    "type": "list_usd",
+                    "usd": [0.00025, 0.0015],
+                    "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
+                  } : {
+                    "type": "list_usd",
+                    "usd": [0.002, 0.012],
+                    "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
+                  }
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        model: dict,
+        seed: int,
+        system_prompt: str = "",
+    ) -> IO.NodeOutput:
+        """Send the prompt plus any connected media and files to the chosen model; return its text."""
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        model_id = GEMINI_V2_MODELS[model["model"]]
+
+        def connected(group: str) -> list:
+            # Media groups are read as a mapping of slots; entries left as None are dropped.
+            return [item for item in (model.get(group) or {}).values() if item is not None]
+
+        parts: list[GeminiPart] = [GeminiPart(text=prompt)]
+        images, audios, videos = connected("images"), connected("audio"), connected("video")
+        if images or audios or videos:
+            parts.extend(await build_gemini_media_parts(cls, images, audios, videos))
+        files = model.get("files")
+        if files is not None:
+            parts.extend(files)
+
+        # An empty system prompt is sent as no system instruction at all.
+        gemini_system_prompt = (
+            GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None)
+            if system_prompt
+            else None
+        )
+
+        endpoint = ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model_id}", method="POST")
+        user_turn = GeminiContent(role=GeminiRole.user, parts=parts)
+        generation_config = GeminiGenerationConfig(
+            temperature=model["temperature"],
+            topP=model["top_p"],
+            maxOutputTokens=model["max_output_tokens"],
+            seed=seed if seed > 0 else None,  # a seed of 0 is sent as "no seed"
+            thinkingConfig=GeminiThinkingConfig(thinkingLevel=model["thinking_level"]),
+        )
+        request = GeminiGenerateContentRequest(
+            contents=[user_turn],
+            generationConfig=generation_config,
+            systemInstruction=gemini_system_prompt,
+        )
+        response = await sync_op(
+            cls,
+            endpoint=endpoint,
+            data=request,
+            response_model=GeminiGenerateContentResponse,
+            price_extractor=calculate_tokens_price,
+        )
+        return IO.NodeOutput(get_text_from_response(response) or "Empty response from Gemini model...")
+
+
 class GeminiInputFiles(IO.ComfyNode):
    """
    Loads and formats input files for use with the Gemini API.
@@ -1222,6 +1517,7 @@ class GeminiExtension(ComfyExtension):
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
        return [
            GeminiNode,
+            GeminiNodeV2,
            GeminiImage,
            GeminiImage2,
            GeminiNanoBanana2,
@@ -42,9 +42,11 @@ async def _upload_image_to_krea_assets(cls: type[IO.ComfyNode], image: Input.Ima


 _MODEL_MEDIUM = "Krea 2 Medium"
+_MODEL_MEDIUM_TURBO = "Krea 2 Medium Turbo"
 _MODEL_LARGE = "Krea 2 Large"
 _MODEL_ENDPOINTS: dict[str, str] = {
    _MODEL_MEDIUM: "/proxy/krea/generate/image/krea/krea-2/medium",
+    _MODEL_MEDIUM_TURBO: "/proxy/krea/generate/image/krea/krea-2/medium-turbo",
    _MODEL_LARGE: "/proxy/krea/generate/image/krea/krea-2/large",
 }

@@ -57,7 +59,7 @@ _UUID_RE = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F


 def _krea_model_inputs() -> list:
-    """Nested inputs shared by both Krea 2 Medium and Large under the DynamicCombo."""
+    """Nested inputs shared by Krea 2 Medium, Medium Turbo and Large under the DynamicCombo."""
    return [
        IO.Combo.Input(
            "aspect_ratio",
@@ -123,6 +125,7 @@ class Krea2ImageNode(IO.ComfyNode):
                    "model",
                    options=[
                        IO.DynamicCombo.Option(_MODEL_MEDIUM, _krea_model_inputs()),
+                        IO.DynamicCombo.Option(_MODEL_MEDIUM_TURBO, _krea_model_inputs()),
                        IO.DynamicCombo.Option(_MODEL_LARGE, _krea_model_inputs()),
                    ],
                    tooltip="Krea 2 Medium is best for expressive illustrations; "
@@ -151,14 +154,15 @@ class Krea2ImageNode(IO.ComfyNode):
                ),
                expr="""
                (
-                  $isLarge := widgets.model = "krea 2 large";
+                  $rates := {
+                    "krea 2 medium turbo": {"text": 0.015, "style": 0.0175, "moodboard": 0.02},
+                    "krea 2 medium": {"text": 0.03, "style": 0.035, "moodboard": 0.04},
+                    "krea 2 large": {"text": 0.06, "style": 0.065, "moodboard": 0.07}
+                  };
+                  $r := $lookup($rates, widgets.model);
                  $hasMoodboard := $length($lookup(widgets, "model.moodboard_id")) > 0;
                  $hasStyle := $lookup(inputs, "model.style_reference").connected;
-                  $usd := $hasMoodboard
-                    ? ($isLarge ? 0.07 : 0.04)
-                    : ($hasStyle
-                        ? ($isLarge ? 0.065 : 0.035)
-                        : ($isLarge ? 0.06 : 0.03));
+                  $usd := $hasMoodboard ? $r.moodboard : ($hasStyle ? $r.style : $r.text);
                  {"type":"usd","usd": $usd}
                )
                """,
@@ -158,7 +158,7 @@ class SaveAudio(IO.ComfyNode):
        return IO.Schema(
            node_id="SaveAudio",
            search_aliases=["export flac"],
-            display_name="Save Audio (FLAC)",
+            display_name="Save Audio (FLAC) (Deprecated)",
            category="audio",
            essentials_category="Audio",
            inputs=[
@@ -167,6 +167,7 @@ class SaveAudio(IO.ComfyNode):
            ],
            hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],
            is_output_node=True,
+            is_deprecated=True,
        )

    @classmethod
@@ -186,7 +187,7 @@ class SaveAudioMP3(IO.ComfyNode):
        return IO.Schema(
            node_id="SaveAudioMP3",
            search_aliases=["export mp3"],
-            display_name="Save Audio (MP3)",
+            display_name="Save Audio (MP3) (Deprecated)",
            category="audio",
            essentials_category="Audio",
            inputs=[
@@ -196,6 +197,7 @@ class SaveAudioMP3(IO.ComfyNode):
            ],
            hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],
            is_output_node=True,
+            is_deprecated=True,
        )

    @classmethod
@@ -217,7 +219,7 @@ class SaveAudioOpus(IO.ComfyNode):
        return IO.Schema(
            node_id="SaveAudioOpus",
            search_aliases=["export opus"],
-            display_name="Save Audio (Opus)",
+            display_name="Save Audio (Opus) (Deprecated)",
            category="audio",
            inputs=[
                IO.Audio.Input("audio"),
@@ -226,6 +228,7 @@ class SaveAudioOpus(IO.ComfyNode):
            ],
            hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],
            is_output_node=True,
+            is_deprecated=True,
        )

    @classmethod
@@ -241,6 +244,54 @@ class SaveAudioOpus(IO.ComfyNode):
    save_opus = execute  # TODO: remove


+class SaveAudioAdvanced(IO.ComfyNode):
+    """Output node that saves audio as flac, mp3 or opus, with a quality choice for mp3 and opus."""
+
+    @classmethod
+    def define_schema(cls):
+        mp3_inputs = [IO.Combo.Input("quality", options=["V0", "128k", "320k"], default="V0")]
+        opus_inputs = [IO.Combo.Input("quality", options=["64k", "96k", "128k", "192k", "320k"], default="128k")]
+        return IO.Schema(
+            node_id="SaveAudioAdvanced",
+            search_aliases=["save audio", "export audio", "output audio", "write audio", "flac", "mp3", "opus"],
+            display_name="Save Audio (Advanced)",
+            description="Saves the input audio to your ComfyUI output directory.",
+            category="audio",
+            inputs=[
+                IO.Audio.Input("audio", tooltip="The audio to save."),
+                IO.String.Input(
+                    "filename_prefix",
+                    default="audio/ComfyUI",
+                    tooltip=(
+                        "The prefix for the file to save. May include formatting tokens "
+                        "such as %date:yyyy-MM-dd%."
+                    ),
+                ),
+                IO.DynamicCombo.Input(
+                    "format",
+                    options=[
+                        IO.DynamicCombo.Option("flac", []),
+                        IO.DynamicCombo.Option("mp3", mp3_inputs),
+                        IO.DynamicCombo.Option("opus", opus_inputs),
+                    ],
+                    tooltip="The file format in which to save the audio.",
+                ),
+            ],
+            hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],
+            is_output_node=True,
+        )
+
+    @classmethod
+    def execute(cls, audio, filename_prefix: str, format: dict) -> IO.NodeOutput:
+        """Save ``audio`` in the selected format, forwarding ``quality`` only when one is set."""
+        save_kwargs = {"format": format.get("format", None)}
+        if format.get("quality", None):
+            save_kwargs["quality"] = format["quality"]
+        ui = UI.AudioSaveHelper.get_save_audio_ui(audio, filename_prefix=filename_prefix, cls=cls, **save_kwargs)
+        return IO.NodeOutput(ui=ui)
+
+
 class PreviewAudio(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
@@ -822,6 +873,7 @@ class AudioExtension(ComfyExtension):
            SaveAudio,
            SaveAudioMP3,
            SaveAudioOpus,
+            SaveAudioAdvanced,
            LoadAudio,
            PreviewAudio,
            ConditioningStableAudio,
@@ -933,9 +933,10 @@ class Guider_DualModel(comfy.samplers.CFGGuider):

    def predict_noise(self, x, timestep, model_options={}, seed=None):
        positive = self.conds.get("positive", None)
-        if self.uncond_inner is None:  # cfg == 1 or no negative -> single model, cond only
-            return comfy.samplers.calc_cond_batch(self.inner_model, [positive], x, timestep, model_options)[0]
        cond = comfy.samplers.calc_cond_batch(self.inner_model, [positive], x, timestep, model_options)[0]
+        # uncond model not loaded (base cfg==1/no negative), or cfg driven to 1.0 this step -> single model, cond only
+        if self.uncond_inner is None or (math.isclose(self.cfg, 1.0) and not model_options.get("disable_cfg1_optimization", False)):
+            return cond

        uncond_model_options = model_options
        if "multigpu_clones" in model_options: # TODO: support multigpu instead of just running uncond on a single GPU
@@ -1140,7 +1141,7 @@ class CFGOverride(io.ComfyNode):
        return io.Schema(
            node_id="CFGOverride",
            display_name="CFG Override",
-            description="Override cfg to a fixed value over a [start, end] percent slice of the steps. "
+            description="Override cfg to a fixed value over a [start, end] percent (sigma) range. "
                        "With multiple overrides, the one nearest the sampler wins on overlap.",
            category="sampling/custom_sampling",
            inputs=[
@@ -1,5 +1,5 @@
-comfyui-frontend-package==1.44.19
-comfyui-workflow-templates==0.9.94
+comfyui-frontend-package==1.45.15
+comfyui-workflow-templates==0.9.98
 comfyui-embedded-docs==0.5.2
 torch
 torchsde
@@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0
 filelock
 av>=16.0.0
 comfy-kitchen==0.2.10
-comfy-aimdo==0.4.8
+comfy-aimdo==0.4.9
 requests
 simpleeval>=1.0.0
 blake3