[Partner Nodes] feat(Google): add Gemini Video Omni node (#14695)

2026-07-24 08:57:57 +08:00 · 2026-06-30 17:17:53 -04:00
parent 1c59659a2f
commit b70944e710
2 changed files with 174 additions and 1 deletions
@@ -121,6 +121,7 @@ class GeminiGenerationConfig(BaseModel):
    topK: int | None = Field(None, ge=1)
    topP: float | None = Field(None, ge=0.0, le=1.0)
    thinkingConfig: GeminiThinkingConfig | None = Field(None)
+    responseModalities: list[str] | None = Field(None)


 class GeminiImageOutputOptions(BaseModel):
@@ -13,7 +13,7 @@ import torch
 from typing_extensions import override

 import folder_paths
-from comfy_api.latest import IO, ComfyExtension, Input, Types
+from comfy_api.latest import IO, ComfyExtension, Input, InputImpl, Types
 from comfy_api_nodes.apis.gemini import (
    GeminiContent,
    GeminiFileData,
@@ -37,6 +37,7 @@ from comfy_api_nodes.util import (
    audio_to_base64_string,
    bytesio_to_image_tensor,
    download_url_to_image_tensor,
+    download_url_to_video_output,
    get_number_of_images,
    sync_op,
    tensor_to_base64_string,
@@ -45,6 +46,7 @@ from comfy_api_nodes.util import (
    upload_images_to_comfyapi,
    upload_video_to_comfyapi,
    validate_string,
+    validate_video_duration,
    video_to_base64_string,
 )

@@ -229,10 +231,29 @@ async def get_image_from_response(response: GeminiGenerateContentResponse, thoug
    return torch.cat(image_tensors, dim=0)


+async def get_video_from_response(
+    response: GeminiGenerateContentResponse, cls: type[IO.ComfyNode] | None = None
+) -> InputImpl.VideoFromFile:
+    parts = get_parts_by_type(response, "video/*")
+    for part in parts:
+        if part.inlineData and part.inlineData.data:
+            return InputImpl.VideoFromFile(BytesIO(base64.b64decode(part.inlineData.data)))
+        if part.fileData and part.fileData.fileUri:
+            return await download_url_to_video_output(part.fileData.fileUri, cls=cls)
+    model_message = get_text_from_response(response).strip()
+    if model_message:
+        raise ValueError(f"Gemini did not generate a video. Model response: {model_message}")
+    raise ValueError(
+        "Gemini did not generate a video. Try rephrasing your prompt, "
+        "shortening the requested duration, or reducing the number of input images/videos."
+    )
+
+
 def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | None:
    if not response.modelVersion:
        return None
    # Define prices (Cost per 1,000,000 tokens), see https://cloud.google.com/vertex-ai/generative-ai/pricing
+    output_video_tokens_price = 0.0
    if response.modelVersion == "gemini-2.5-pro":
        input_tokens_price = 1.25
        output_text_tokens_price = 10.0
@@ -265,6 +286,11 @@ def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | N
        input_tokens_price = 0.25
        output_text_tokens_price = 1.50
        output_image_tokens_price = 30.0
+    elif response.modelVersion == "gemini-omni-flash-preview":
+        input_tokens_price = 2.145
+        output_text_tokens_price = 12.87
+        output_image_tokens_price = 0.0
+        output_video_tokens_price = 25.025
    else:
        return None
    final_price = response.usageMetadata.promptTokenCount * input_tokens_price
@@ -272,6 +298,8 @@ def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | N
        for i in response.usageMetadata.candidatesTokensDetails:
            if i.modality == Modality.IMAGE:
                final_price += output_image_tokens_price * i.tokenCount  # for Nano Banana models
+            elif i.modality == Modality.VIDEO:
+                final_price += output_video_tokens_price * i.tokenCount  # for Omni Flash
            else:
                final_price += output_text_tokens_price * i.tokenCount
    if response.usageMetadata.thoughtsTokenCount:
@@ -1531,6 +1559,149 @@ class GeminiNanoBanana2V2(IO.ComfyNode):
        )


+OMNI_MAX_IMAGES = 14
+OMNI_MAX_VIDEOS = 3
+
+OMNI_MODELS: dict[str, str] = {
+    "Omni Flash": "gemini-omni-flash-preview",
+}
+
+
+def _omni_flash_inputs() -> list[Input]:
+    """Per-model inputs for the Omni video DynamicCombo (prompt + reference media + sampling)."""
+    return [
+        IO.String.Input(
+            "prompt",
+            multiline=True,
+            default="",
+            tooltip="Describe the video to generate. Specify the length and aspect ratio directly in the "
+            'prompt, e.g. "a 6-second clip in 16:9". Length may be 3-10 seconds; the aspect ratio must be '
+            "16:9 (landscape) or 9:16 (portrait). The output is 720p, 24 FPS, with audio.",
+        ),
+        IO.Autogrow.Input(
+            "images",
+            template=IO.Autogrow.TemplateNames(
+                IO.Image.Input("image"),
+                names=[f"image_{i}" for i in range(1, OMNI_MAX_IMAGES + 1)],
+                min=0,
+            ),
+            tooltip=f"Optional reference image(s) to guide or animate the video. Up to {OMNI_MAX_IMAGES} images.",
+        ),
+        IO.Autogrow.Input(
+            "videos",
+            template=IO.Autogrow.TemplateNames(
+                IO.Video.Input("video"),
+                names=[f"video_{i}" for i in range(1, OMNI_MAX_VIDEOS + 1)],
+                min=0,
+            ),
+            tooltip=f"Optional reference video(s) to guide or edit. Up to {OMNI_MAX_VIDEOS} videos, "
+            f"each up to 10 seconds long.",
+        ),
+        IO.Float.Input(
+            "temperature",
+            default=1.0,
+            min=0.0,
+            max=2.0,
+            step=0.01,
+            tooltip="Controls randomness. Lower is more focused/deterministic, higher is more varied.",
+            advanced=True,
+        ),
+        IO.Float.Input(
+            "top_p",
+            default=0.95,
+            min=0.0,
+            max=1.0,
+            step=0.01,
+            tooltip="Nucleus sampling: sample from the smallest token set whose cumulative probability reaches top_p.",
+            advanced=True,
+        ),
+    ]
+
+
+class GeminiVideoOmni(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="GeminiVideoOmni",
+            display_name="Google Gemini Omni (Video)",
+            category="partner/video/Gemini",
+            essentials_category="Video Generation",
+            description="Generate a video with audio from a text prompt using Google's Gemini Omni Flash model. "
+            "Optionally provide reference images and/or videos to guide or edit the result. Describe the desired "
+            "length (3-10s) and aspect ratio (16:9 or 9:16) directly in the prompt.",
+            inputs=[
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=[
+                        IO.DynamicCombo.Option("Omni Flash", _omni_flash_inputs()),
+                    ],
+                    tooltip="The Gemini video model used to generate the video.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=42,
+                    min=0,
+                    max=2147483647,
+                    control_after_generate=True,
+                    tooltip="Seed controls whether the node should re-run; "
+                    "results are non-deterministic regardless of seed.",
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+                IO.String.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                expr='{"type":"usd","usd":0.146,"format":{"suffix":"/second","approximate":true}}'
+            ),
+        )
+
+    @classmethod
+    async def execute(cls, model: dict, seed: int) -> IO.NodeOutput:
+        prompt = model.get("prompt") or ""
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        model_id = OMNI_MODELS[model["model"]]
+
+        images = [t for t in (model.get("images") or {}).values() if t is not None]
+        videos = [v for v in (model.get("videos") or {}).values() if v is not None]
+        if sum(get_number_of_images(t) for t in images) > OMNI_MAX_IMAGES:
+            raise ValueError(f"The current maximum number of supported images is {OMNI_MAX_IMAGES}.")
+        if len(videos) > OMNI_MAX_VIDEOS:
+            raise ValueError(f"The current maximum number of supported videos is {OMNI_MAX_VIDEOS}.")
+        for video in videos:
+            validate_video_duration(video, max_duration=10)
+
+        parts: list[GeminiPart] = []
+        if images or videos:
+            parts.extend(await build_gemini_media_parts(cls, images, [], videos))
+        parts.append(GeminiPart(text=prompt))
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model_id}", method="POST"),
+            data=GeminiGenerateContentRequest(
+                contents=[GeminiContent(role=GeminiRole.user, parts=parts)],
+                generationConfig=GeminiGenerationConfig(
+                    responseModalities=["TEXT", "VIDEO"],
+                    temperature=model.get("temperature", 1.0),
+                    topP=model.get("top_p", 0.95),
+                ),
+            ),
+            response_model=GeminiGenerateContentResponse,
+            price_extractor=calculate_tokens_price,
+        )
+        return IO.NodeOutput(
+            await get_video_from_response(response, cls=cls),
+            get_text_from_response(response),
+        )
+
+
 class GeminiExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -1541,6 +1712,7 @@ class GeminiExtension(ComfyExtension):
            GeminiImage2,
            GeminiNanoBanana2,
            GeminiNanoBanana2V2,
+            GeminiVideoOmni,
            GeminiInputFiles,
        ]