[Partner Nodes] feat(Google): add Gemini Video Omni node (#14695)

This commit is contained in:
Alexander Piskun 2026-07-01 00:17:53 +03:00 committed by GitHub
parent 1c59659a2f
commit b70944e710
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 174 additions and 1 deletions

View File

@ -121,6 +121,7 @@ class GeminiGenerationConfig(BaseModel):
topK: int | None = Field(None, ge=1)
topP: float | None = Field(None, ge=0.0, le=1.0)
thinkingConfig: GeminiThinkingConfig | None = Field(None)
responseModalities: list[str] | None = Field(None)
class GeminiImageOutputOptions(BaseModel):

View File

@ -13,7 +13,7 @@ import torch
from typing_extensions import override
import folder_paths
from comfy_api.latest import IO, ComfyExtension, Input, Types
from comfy_api.latest import IO, ComfyExtension, Input, InputImpl, Types
from comfy_api_nodes.apis.gemini import (
GeminiContent,
GeminiFileData,
@ -37,6 +37,7 @@ from comfy_api_nodes.util import (
audio_to_base64_string,
bytesio_to_image_tensor,
download_url_to_image_tensor,
download_url_to_video_output,
get_number_of_images,
sync_op,
tensor_to_base64_string,
@ -45,6 +46,7 @@ from comfy_api_nodes.util import (
upload_images_to_comfyapi,
upload_video_to_comfyapi,
validate_string,
validate_video_duration,
video_to_base64_string,
)
@ -229,10 +231,29 @@ async def get_image_from_response(response: GeminiGenerateContentResponse, thoug
return torch.cat(image_tensors, dim=0)
async def get_video_from_response(
response: GeminiGenerateContentResponse, cls: type[IO.ComfyNode] | None = None
) -> InputImpl.VideoFromFile:
parts = get_parts_by_type(response, "video/*")
for part in parts:
if part.inlineData and part.inlineData.data:
return InputImpl.VideoFromFile(BytesIO(base64.b64decode(part.inlineData.data)))
if part.fileData and part.fileData.fileUri:
return await download_url_to_video_output(part.fileData.fileUri, cls=cls)
model_message = get_text_from_response(response).strip()
if model_message:
raise ValueError(f"Gemini did not generate a video. Model response: {model_message}")
raise ValueError(
"Gemini did not generate a video. Try rephrasing your prompt, "
"shortening the requested duration, or reducing the number of input images/videos."
)
def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | None:
if not response.modelVersion:
return None
# Define prices (Cost per 1,000,000 tokens), see https://cloud.google.com/vertex-ai/generative-ai/pricing
output_video_tokens_price = 0.0
if response.modelVersion == "gemini-2.5-pro":
input_tokens_price = 1.25
output_text_tokens_price = 10.0
@ -265,6 +286,11 @@ def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | N
input_tokens_price = 0.25
output_text_tokens_price = 1.50
output_image_tokens_price = 30.0
elif response.modelVersion == "gemini-omni-flash-preview":
input_tokens_price = 2.145
output_text_tokens_price = 12.87
output_image_tokens_price = 0.0
output_video_tokens_price = 25.025
else:
return None
final_price = response.usageMetadata.promptTokenCount * input_tokens_price
@ -272,6 +298,8 @@ def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | N
for i in response.usageMetadata.candidatesTokensDetails:
if i.modality == Modality.IMAGE:
final_price += output_image_tokens_price * i.tokenCount # for Nano Banana models
elif i.modality == Modality.VIDEO:
final_price += output_video_tokens_price * i.tokenCount # for Omni Flash
else:
final_price += output_text_tokens_price * i.tokenCount
if response.usageMetadata.thoughtsTokenCount:
@ -1531,6 +1559,149 @@ class GeminiNanoBanana2V2(IO.ComfyNode):
)
OMNI_MAX_IMAGES = 14
OMNI_MAX_VIDEOS = 3
OMNI_MODELS: dict[str, str] = {
"Omni Flash": "gemini-omni-flash-preview",
}
def _omni_flash_inputs() -> list[Input]:
"""Per-model inputs for the Omni video DynamicCombo (prompt + reference media + sampling)."""
return [
IO.String.Input(
"prompt",
multiline=True,
default="",
tooltip="Describe the video to generate. Specify the length and aspect ratio directly in the "
'prompt, e.g. "a 6-second clip in 16:9". Length may be 3-10 seconds; the aspect ratio must be '
"16:9 (landscape) or 9:16 (portrait). The output is 720p, 24 FPS, with audio.",
),
IO.Autogrow.Input(
"images",
template=IO.Autogrow.TemplateNames(
IO.Image.Input("image"),
names=[f"image_{i}" for i in range(1, OMNI_MAX_IMAGES + 1)],
min=0,
),
tooltip=f"Optional reference image(s) to guide or animate the video. Up to {OMNI_MAX_IMAGES} images.",
),
IO.Autogrow.Input(
"videos",
template=IO.Autogrow.TemplateNames(
IO.Video.Input("video"),
names=[f"video_{i}" for i in range(1, OMNI_MAX_VIDEOS + 1)],
min=0,
),
tooltip=f"Optional reference video(s) to guide or edit. Up to {OMNI_MAX_VIDEOS} videos, "
f"each up to 10 seconds long.",
),
IO.Float.Input(
"temperature",
default=1.0,
min=0.0,
max=2.0,
step=0.01,
tooltip="Controls randomness. Lower is more focused/deterministic, higher is more varied.",
advanced=True,
),
IO.Float.Input(
"top_p",
default=0.95,
min=0.0,
max=1.0,
step=0.01,
tooltip="Nucleus sampling: sample from the smallest token set whose cumulative probability reaches top_p.",
advanced=True,
),
]
class GeminiVideoOmni(IO.ComfyNode):
@classmethod
def define_schema(cls):
return IO.Schema(
node_id="GeminiVideoOmni",
display_name="Google Gemini Omni (Video)",
category="partner/video/Gemini",
essentials_category="Video Generation",
description="Generate a video with audio from a text prompt using Google's Gemini Omni Flash model. "
"Optionally provide reference images and/or videos to guide or edit the result. Describe the desired "
"length (3-10s) and aspect ratio (16:9 or 9:16) directly in the prompt.",
inputs=[
IO.DynamicCombo.Input(
"model",
options=[
IO.DynamicCombo.Option("Omni Flash", _omni_flash_inputs()),
],
tooltip="The Gemini video model used to generate the video.",
),
IO.Int.Input(
"seed",
default=42,
min=0,
max=2147483647,
control_after_generate=True,
tooltip="Seed controls whether the node should re-run; "
"results are non-deterministic regardless of seed.",
),
],
outputs=[
IO.Video.Output(),
IO.String.Output(),
],
hidden=[
IO.Hidden.auth_token_comfy_org,
IO.Hidden.api_key_comfy_org,
IO.Hidden.unique_id,
],
is_api_node=True,
price_badge=IO.PriceBadge(
expr='{"type":"usd","usd":0.146,"format":{"suffix":"/second","approximate":true}}'
),
)
@classmethod
async def execute(cls, model: dict, seed: int) -> IO.NodeOutput:
prompt = model.get("prompt") or ""
validate_string(prompt, strip_whitespace=True, min_length=1)
model_id = OMNI_MODELS[model["model"]]
images = [t for t in (model.get("images") or {}).values() if t is not None]
videos = [v for v in (model.get("videos") or {}).values() if v is not None]
if sum(get_number_of_images(t) for t in images) > OMNI_MAX_IMAGES:
raise ValueError(f"The current maximum number of supported images is {OMNI_MAX_IMAGES}.")
if len(videos) > OMNI_MAX_VIDEOS:
raise ValueError(f"The current maximum number of supported videos is {OMNI_MAX_VIDEOS}.")
for video in videos:
validate_video_duration(video, max_duration=10)
parts: list[GeminiPart] = []
if images or videos:
parts.extend(await build_gemini_media_parts(cls, images, [], videos))
parts.append(GeminiPart(text=prompt))
response = await sync_op(
cls,
ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model_id}", method="POST"),
data=GeminiGenerateContentRequest(
contents=[GeminiContent(role=GeminiRole.user, parts=parts)],
generationConfig=GeminiGenerationConfig(
responseModalities=["TEXT", "VIDEO"],
temperature=model.get("temperature", 1.0),
topP=model.get("top_p", 0.95),
),
),
response_model=GeminiGenerateContentResponse,
price_extractor=calculate_tokens_price,
)
return IO.NodeOutput(
await get_video_from_response(response, cls=cls),
get_text_from_response(response),
)
class GeminiExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@ -1541,6 +1712,7 @@ class GeminiExtension(ComfyExtension):
GeminiImage2,
GeminiNanoBanana2,
GeminiNanoBanana2V2,
GeminiVideoOmni,
GeminiInputFiles,
]