mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-07-03 21:20:49 +08:00
[Partner Nodes] feat(Google): add Gemini Video Omni node (#14695)
This commit is contained in:
parent
1c59659a2f
commit
b70944e710
@ -121,6 +121,7 @@ class GeminiGenerationConfig(BaseModel):
|
|||||||
topK: int | None = Field(None, ge=1)
|
topK: int | None = Field(None, ge=1)
|
||||||
topP: float | None = Field(None, ge=0.0, le=1.0)
|
topP: float | None = Field(None, ge=0.0, le=1.0)
|
||||||
thinkingConfig: GeminiThinkingConfig | None = Field(None)
|
thinkingConfig: GeminiThinkingConfig | None = Field(None)
|
||||||
|
responseModalities: list[str] | None = Field(None)
|
||||||
|
|
||||||
|
|
||||||
class GeminiImageOutputOptions(BaseModel):
|
class GeminiImageOutputOptions(BaseModel):
|
||||||
|
|||||||
@ -13,7 +13,7 @@ import torch
|
|||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
|
|
||||||
import folder_paths
|
import folder_paths
|
||||||
from comfy_api.latest import IO, ComfyExtension, Input, Types
|
from comfy_api.latest import IO, ComfyExtension, Input, InputImpl, Types
|
||||||
from comfy_api_nodes.apis.gemini import (
|
from comfy_api_nodes.apis.gemini import (
|
||||||
GeminiContent,
|
GeminiContent,
|
||||||
GeminiFileData,
|
GeminiFileData,
|
||||||
@ -37,6 +37,7 @@ from comfy_api_nodes.util import (
|
|||||||
audio_to_base64_string,
|
audio_to_base64_string,
|
||||||
bytesio_to_image_tensor,
|
bytesio_to_image_tensor,
|
||||||
download_url_to_image_tensor,
|
download_url_to_image_tensor,
|
||||||
|
download_url_to_video_output,
|
||||||
get_number_of_images,
|
get_number_of_images,
|
||||||
sync_op,
|
sync_op,
|
||||||
tensor_to_base64_string,
|
tensor_to_base64_string,
|
||||||
@ -45,6 +46,7 @@ from comfy_api_nodes.util import (
|
|||||||
upload_images_to_comfyapi,
|
upload_images_to_comfyapi,
|
||||||
upload_video_to_comfyapi,
|
upload_video_to_comfyapi,
|
||||||
validate_string,
|
validate_string,
|
||||||
|
validate_video_duration,
|
||||||
video_to_base64_string,
|
video_to_base64_string,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -229,10 +231,29 @@ async def get_image_from_response(response: GeminiGenerateContentResponse, thoug
|
|||||||
return torch.cat(image_tensors, dim=0)
|
return torch.cat(image_tensors, dim=0)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_video_from_response(
    response: GeminiGenerateContentResponse, cls: type[IO.ComfyNode] | None = None
) -> InputImpl.VideoFromFile:
    """Return the first usable video found among the ``video/*`` parts of *response*.

    Parts are examined in order. For each part, inline base64 data takes priority and is
    decoded into an in-memory ``VideoFromFile``; failing that, a file URI is handed to
    ``download_url_to_video_output`` (with *cls* forwarded).

    Raises:
        ValueError: when no video part carries inline data or a file URI. The message
            includes the model's text reply when one is present, otherwise generic hints.
    """
    for video_part in get_parts_by_type(response, "video/*"):
        inline = video_part.inlineData
        if inline and inline.data:
            return InputImpl.VideoFromFile(BytesIO(base64.b64decode(inline.data)))
        remote = video_part.fileData
        if remote and remote.fileUri:
            return await download_url_to_video_output(remote.fileUri, cls=cls)

    # No video came back: surface whatever the model said, if anything.
    explanation = get_text_from_response(response).strip()
    if not explanation:
        raise ValueError(
            "Gemini did not generate a video. Try rephrasing your prompt, "
            "shortening the requested duration, or reducing the number of input images/videos."
        )
    raise ValueError(f"Gemini did not generate a video. Model response: {explanation}")
|
||||||
|
|
||||||
|
|
||||||
def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | None:
|
def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | None:
|
||||||
if not response.modelVersion:
|
if not response.modelVersion:
|
||||||
return None
|
return None
|
||||||
# Define prices (Cost per 1,000,000 tokens), see https://cloud.google.com/vertex-ai/generative-ai/pricing
|
# Define prices (Cost per 1,000,000 tokens), see https://cloud.google.com/vertex-ai/generative-ai/pricing
|
||||||
|
output_video_tokens_price = 0.0
|
||||||
if response.modelVersion == "gemini-2.5-pro":
|
if response.modelVersion == "gemini-2.5-pro":
|
||||||
input_tokens_price = 1.25
|
input_tokens_price = 1.25
|
||||||
output_text_tokens_price = 10.0
|
output_text_tokens_price = 10.0
|
||||||
@ -265,6 +286,11 @@ def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | N
|
|||||||
input_tokens_price = 0.25
|
input_tokens_price = 0.25
|
||||||
output_text_tokens_price = 1.50
|
output_text_tokens_price = 1.50
|
||||||
output_image_tokens_price = 30.0
|
output_image_tokens_price = 30.0
|
||||||
|
elif response.modelVersion == "gemini-omni-flash-preview":
|
||||||
|
input_tokens_price = 2.145
|
||||||
|
output_text_tokens_price = 12.87
|
||||||
|
output_image_tokens_price = 0.0
|
||||||
|
output_video_tokens_price = 25.025
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
final_price = response.usageMetadata.promptTokenCount * input_tokens_price
|
final_price = response.usageMetadata.promptTokenCount * input_tokens_price
|
||||||
@ -272,6 +298,8 @@ def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | N
|
|||||||
for i in response.usageMetadata.candidatesTokensDetails:
|
for i in response.usageMetadata.candidatesTokensDetails:
|
||||||
if i.modality == Modality.IMAGE:
|
if i.modality == Modality.IMAGE:
|
||||||
final_price += output_image_tokens_price * i.tokenCount # for Nano Banana models
|
final_price += output_image_tokens_price * i.tokenCount # for Nano Banana models
|
||||||
|
elif i.modality == Modality.VIDEO:
|
||||||
|
final_price += output_video_tokens_price * i.tokenCount # for Omni Flash
|
||||||
else:
|
else:
|
||||||
final_price += output_text_tokens_price * i.tokenCount
|
final_price += output_text_tokens_price * i.tokenCount
|
||||||
if response.usageMetadata.thoughtsTokenCount:
|
if response.usageMetadata.thoughtsTokenCount:
|
||||||
@ -1531,6 +1559,149 @@ class GeminiNanoBanana2V2(IO.ComfyNode):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Upper bounds on the reference media accepted by the Omni video node.
OMNI_MAX_VIDEOS = 3
OMNI_MAX_IMAGES = 14

# Display name offered in the node's model combo -> model id appended to the API endpoint path.
OMNI_MODELS: dict[str, str] = {"Omni Flash": "gemini-omni-flash-preview"}
|
||||||
|
|
||||||
|
|
||||||
|
def _omni_flash_inputs() -> list[Input]:
    """Build the inputs nested under the "Omni Flash" option of the model DynamicCombo.

    Returns, in order: the text prompt, an autogrow group of reference images
    (``image_1`` .. ``image_{OMNI_MAX_IMAGES}``), an autogrow group of reference videos
    (``video_1`` .. ``video_{OMNI_MAX_VIDEOS}``), and the advanced ``temperature`` and
    ``top_p`` sampling controls.
    """
    prompt_input = IO.String.Input(
        "prompt",
        multiline=True,
        default="",
        tooltip="Describe the video to generate. Specify the length and aspect ratio directly in the "
        'prompt, e.g. "a 6-second clip in 16:9". Length may be 3-10 seconds; the aspect ratio must be '
        "16:9 (landscape) or 9:16 (portrait). The output is 720p, 24 FPS, with audio.",
    )

    # Both media groups are optional (min=0) and grow up to their respective caps.
    image_template = IO.Autogrow.TemplateNames(
        IO.Image.Input("image"),
        names=[f"image_{slot}" for slot in range(1, OMNI_MAX_IMAGES + 1)],
        min=0,
    )
    images_input = IO.Autogrow.Input(
        "images",
        template=image_template,
        tooltip=f"Optional reference image(s) to guide or animate the video. Up to {OMNI_MAX_IMAGES} images.",
    )

    video_template = IO.Autogrow.TemplateNames(
        IO.Video.Input("video"),
        names=[f"video_{slot}" for slot in range(1, OMNI_MAX_VIDEOS + 1)],
        min=0,
    )
    videos_input = IO.Autogrow.Input(
        "videos",
        template=video_template,
        tooltip=f"Optional reference video(s) to guide or edit. Up to {OMNI_MAX_VIDEOS} videos, "
        f"each up to 10 seconds long.",
    )

    temperature_input = IO.Float.Input(
        "temperature",
        default=1.0,
        min=0.0,
        max=2.0,
        step=0.01,
        tooltip="Controls randomness. Lower is more focused/deterministic, higher is more varied.",
        advanced=True,
    )
    top_p_input = IO.Float.Input(
        "top_p",
        default=0.95,
        min=0.0,
        max=1.0,
        step=0.01,
        tooltip="Nucleus sampling: sample from the smallest token set whose cumulative probability reaches top_p.",
        advanced=True,
    )

    return [prompt_input, images_input, videos_input, temperature_input, top_p_input]
|
||||||
|
|
||||||
|
|
||||||
|
class GeminiVideoOmni(IO.ComfyNode):
    # API node that sends a prompt plus optional reference images/videos to a Gemini Omni
    # model and outputs the generated video together with the model's text reply.

    @classmethod
    def define_schema(cls):
        """Describe the node: model combo with nested per-model inputs, seed, outputs, and pricing badge."""
        model_input = IO.DynamicCombo.Input(
            "model",
            options=[
                IO.DynamicCombo.Option("Omni Flash", _omni_flash_inputs()),
            ],
            tooltip="The Gemini video model used to generate the video.",
        )
        seed_input = IO.Int.Input(
            "seed",
            default=42,
            min=0,
            max=2147483647,
            control_after_generate=True,
            tooltip="Seed controls whether the node should re-run; "
            "results are non-deterministic regardless of seed.",
        )
        node_outputs = [
            IO.Video.Output(),
            IO.String.Output(),
        ]
        hidden_inputs = [
            IO.Hidden.auth_token_comfy_org,
            IO.Hidden.api_key_comfy_org,
            IO.Hidden.unique_id,
        ]
        badge = IO.PriceBadge(
            expr='{"type":"usd","usd":0.146,"format":{"suffix":"/second","approximate":true}}'
        )
        return IO.Schema(
            node_id="GeminiVideoOmni",
            display_name="Google Gemini Omni (Video)",
            category="partner/video/Gemini",
            essentials_category="Video Generation",
            description="Generate a video with audio from a text prompt using Google's Gemini Omni Flash model. "
            "Optionally provide reference images and/or videos to guide or edit the result. Describe the desired "
            "length (3-10s) and aspect ratio (16:9 or 9:16) directly in the prompt.",
            inputs=[model_input, seed_input],
            outputs=node_outputs,
            hidden=hidden_inputs,
            is_api_node=True,
            price_badge=badge,
        )

    @classmethod
    async def execute(cls, model: dict, seed: int) -> IO.NodeOutput:
        """Validate the inputs, call the selected Omni model, and return ``(video, text)``.

        Args:
            model: DynamicCombo payload; read here for ``"model"`` (display name), ``"prompt"``,
                ``"images"``, ``"videos"``, ``"temperature"`` and ``"top_p"``.
            seed: Not sent in the request; per the schema tooltip it only controls re-running.

        Raises:
            ValueError: when the image or video count exceeds ``OMNI_MAX_IMAGES`` /
                ``OMNI_MAX_VIDEOS``. Helpers called here (prompt/duration validation,
                response parsing) may raise as well.
        """
        prompt = model.get("prompt") or ""
        validate_string(prompt, strip_whitespace=True, min_length=1)
        model_id = OMNI_MODELS[model["model"]]

        # Autogrow groups arrive as name -> value mappings; unconnected slots are None.
        reference_images = [img for img in (model.get("images") or {}).values() if img is not None]
        reference_videos = [clip for clip in (model.get("videos") or {}).values() if clip is not None]

        total_images = sum(get_number_of_images(img) for img in reference_images)
        if total_images > OMNI_MAX_IMAGES:
            raise ValueError(f"The current maximum number of supported images is {OMNI_MAX_IMAGES}.")
        if len(reference_videos) > OMNI_MAX_VIDEOS:
            raise ValueError(f"The current maximum number of supported videos is {OMNI_MAX_VIDEOS}.")
        for clip in reference_videos:
            validate_video_duration(clip, max_duration=10)

        # Reference media (if any) goes first; the text prompt is always the last part.
        if reference_images or reference_videos:
            media_parts = await build_gemini_media_parts(cls, reference_images, [], reference_videos)
        else:
            media_parts = []
        parts: list[GeminiPart] = [*media_parts, GeminiPart(text=prompt)]

        endpoint = ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model_id}", method="POST")
        request = GeminiGenerateContentRequest(
            contents=[GeminiContent(role=GeminiRole.user, parts=parts)],
            generationConfig=GeminiGenerationConfig(
                responseModalities=["TEXT", "VIDEO"],
                temperature=model.get("temperature", 1.0),
                topP=model.get("top_p", 0.95),
            ),
        )
        response = await sync_op(
            cls,
            endpoint,
            data=request,
            response_model=GeminiGenerateContentResponse,
            price_extractor=calculate_tokens_price,
        )

        video = await get_video_from_response(response, cls=cls)
        reply_text = get_text_from_response(response)
        return IO.NodeOutput(video, reply_text)
|
||||||
|
|
||||||
|
|
||||||
class GeminiExtension(ComfyExtension):
|
class GeminiExtension(ComfyExtension):
|
||||||
@override
|
@override
|
||||||
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
|
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
|
||||||
@ -1541,6 +1712,7 @@ class GeminiExtension(ComfyExtension):
|
|||||||
GeminiImage2,
|
GeminiImage2,
|
||||||
GeminiNanoBanana2,
|
GeminiNanoBanana2,
|
||||||
GeminiNanoBanana2V2,
|
GeminiNanoBanana2V2,
|
||||||
|
GeminiVideoOmni,
|
||||||
GeminiInputFiles,
|
GeminiInputFiles,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user