diff --git a/comfy_api_nodes/apis/gemini.py b/comfy_api_nodes/apis/gemini.py index caaba8f36..7b2543270 100644 --- a/comfy_api_nodes/apis/gemini.py +++ b/comfy_api_nodes/apis/gemini.py @@ -121,6 +121,7 @@ class GeminiGenerationConfig(BaseModel): topK: int | None = Field(None, ge=1) topP: float | None = Field(None, ge=0.0, le=1.0) thinkingConfig: GeminiThinkingConfig | None = Field(None) + responseModalities: list[str] | None = Field(None) class GeminiImageOutputOptions(BaseModel): diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 1a8aadfd6..aa992802d 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -13,7 +13,7 @@ import torch from typing_extensions import override import folder_paths -from comfy_api.latest import IO, ComfyExtension, Input, Types +from comfy_api.latest import IO, ComfyExtension, Input, InputImpl, Types from comfy_api_nodes.apis.gemini import ( GeminiContent, GeminiFileData, @@ -37,6 +37,7 @@ from comfy_api_nodes.util import ( audio_to_base64_string, bytesio_to_image_tensor, download_url_to_image_tensor, + download_url_to_video_output, get_number_of_images, sync_op, tensor_to_base64_string, @@ -45,6 +46,7 @@ from comfy_api_nodes.util import ( upload_images_to_comfyapi, upload_video_to_comfyapi, validate_string, + validate_video_duration, video_to_base64_string, ) @@ -229,10 +231,29 @@ async def get_image_from_response(response: GeminiGenerateContentResponse, thoug return torch.cat(image_tensors, dim=0) +async def get_video_from_response( + response: GeminiGenerateContentResponse, cls: type[IO.ComfyNode] | None = None +) -> InputImpl.VideoFromFile: + parts = get_parts_by_type(response, "video/*") + for part in parts: + if part.inlineData and part.inlineData.data: + return InputImpl.VideoFromFile(BytesIO(base64.b64decode(part.inlineData.data))) + if part.fileData and part.fileData.fileUri: + return await download_url_to_video_output(part.fileData.fileUri, cls=cls) + model_message = get_text_from_response(response).strip() + if model_message: + raise ValueError(f"Gemini did not generate a video. Model response: {model_message}") + raise ValueError( + "Gemini did not generate a video. Try rephrasing your prompt, " + "shortening the requested duration, or reducing the number of input images/videos." + ) + + def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | None: if not response.modelVersion: return None # Define prices (Cost per 1,000,000 tokens), see https://cloud.google.com/vertex-ai/generative-ai/pricing + output_video_tokens_price = 0.0 if response.modelVersion == "gemini-2.5-pro": input_tokens_price = 1.25 output_text_tokens_price = 10.0 @@ -265,6 +286,11 @@ def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | N input_tokens_price = 0.25 output_text_tokens_price = 1.50 output_image_tokens_price = 30.0 + elif response.modelVersion == "gemini-omni-flash-preview": + input_tokens_price = 2.145 + output_text_tokens_price = 12.87 + output_image_tokens_price = 0.0 + output_video_tokens_price = 25.025 else: return None final_price = response.usageMetadata.promptTokenCount * input_tokens_price @@ -272,6 +298,8 @@ def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | N for i in response.usageMetadata.candidatesTokensDetails: if i.modality == Modality.IMAGE: final_price += output_image_tokens_price * i.tokenCount # for Nano Banana models + elif i.modality == Modality.VIDEO: + final_price += output_video_tokens_price * i.tokenCount # for Omni Flash else: final_price += output_text_tokens_price * i.tokenCount if response.usageMetadata.thoughtsTokenCount: @@ -1531,6 +1559,149 @@ class GeminiNanoBanana2V2(IO.ComfyNode): ) +OMNI_MAX_IMAGES = 14 +OMNI_MAX_VIDEOS = 3 + +OMNI_MODELS: dict[str, str] = { + "Omni Flash": "gemini-omni-flash-preview", +} + + +def _omni_flash_inputs() -> list[Input]: + """Per-model inputs for the Omni video DynamicCombo (prompt + reference media + sampling).""" + return [ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Describe the video to generate. Specify the length and aspect ratio directly in the " + 'prompt, e.g. "a 6-second clip in 16:9". Length may be 3-10 seconds; the aspect ratio must be ' + "16:9 (landscape) or 9:16 (portrait). The output is 720p, 24 FPS, with audio.", + ), + IO.Autogrow.Input( + "images", + template=IO.Autogrow.TemplateNames( + IO.Image.Input("image"), + names=[f"image_{i}" for i in range(1, OMNI_MAX_IMAGES + 1)], + min=0, + ), + tooltip=f"Optional reference image(s) to guide or animate the video. Up to {OMNI_MAX_IMAGES} images.", + ), + IO.Autogrow.Input( + "videos", + template=IO.Autogrow.TemplateNames( + IO.Video.Input("video"), + names=[f"video_{i}" for i in range(1, OMNI_MAX_VIDEOS + 1)], + min=0, + ), + tooltip=f"Optional reference video(s) to guide or edit. Up to {OMNI_MAX_VIDEOS} videos, " + f"each up to 10 seconds long.", + ), + IO.Float.Input( + "temperature", + default=1.0, + min=0.0, + max=2.0, + step=0.01, + tooltip="Controls randomness. Lower is more focused/deterministic, higher is more varied.", + advanced=True, + ), + IO.Float.Input( + "top_p", + default=0.95, + min=0.0, + max=1.0, + step=0.01, + tooltip="Nucleus sampling: sample from the smallest token set whose cumulative probability reaches top_p.", + advanced=True, + ), + ] + + +class GeminiVideoOmni(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="GeminiVideoOmni", + display_name="Google Gemini Omni (Video)", + category="partner/video/Gemini", + essentials_category="Video Generation", + description="Generate a video with audio from a text prompt using Google's Gemini Omni Flash model. " + "Optionally provide reference images and/or videos to guide or edit the result. Describe the desired " + "length (3-10s) and aspect ratio (16:9 or 9:16) directly in the prompt.", + inputs=[ + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option("Omni Flash", _omni_flash_inputs()), + ], + tooltip="The Gemini video model used to generate the video.", + ), + IO.Int.Input( + "seed", + default=42, + min=0, + max=2147483647, + control_after_generate=True, + tooltip="Seed controls whether the node should re-run; " + "results are non-deterministic regardless of seed.", + ), + ], + outputs=[ + IO.Video.Output(), + IO.String.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + expr='{"type":"usd","usd":0.146,"format":{"suffix":"/second","approximate":true}}' + ), + ) + + @classmethod + async def execute(cls, model: dict, seed: int) -> IO.NodeOutput: + prompt = model.get("prompt") or "" + validate_string(prompt, strip_whitespace=True, min_length=1) + model_id = OMNI_MODELS[model["model"]] + + images = [t for t in (model.get("images") or {}).values() if t is not None] + videos = [v for v in (model.get("videos") or {}).values() if v is not None] + if sum(get_number_of_images(t) for t in images) > OMNI_MAX_IMAGES: + raise ValueError(f"The current maximum number of supported images is {OMNI_MAX_IMAGES}.") + if len(videos) > OMNI_MAX_VIDEOS: + raise ValueError(f"The current maximum number of supported videos is {OMNI_MAX_VIDEOS}.") + for video in videos: + validate_video_duration(video, max_duration=10) + + parts: list[GeminiPart] = [] + if images or videos: + parts.extend(await build_gemini_media_parts(cls, images, [], videos)) + parts.append(GeminiPart(text=prompt)) + response = await sync_op( + cls, + ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model_id}", method="POST"), + data=GeminiGenerateContentRequest( + contents=[GeminiContent(role=GeminiRole.user, parts=parts)], + generationConfig=GeminiGenerationConfig( + responseModalities=["TEXT", "VIDEO"], + temperature=model.get("temperature", 1.0), + topP=model.get("top_p", 0.95), + ), + ), + response_model=GeminiGenerateContentResponse, + price_extractor=calculate_tokens_price, + ) + return IO.NodeOutput( + await get_video_from_response(response, cls=cls), + get_text_from_response(response), + ) + + class GeminiExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[IO.ComfyNode]]: @@ -1541,6 +1712,7 @@ class GeminiExtension(ComfyExtension): GeminiImage2, GeminiNanoBanana2, GeminiNanoBanana2V2, + GeminiVideoOmni, GeminiInputFiles, ] diff --git a/comfy_extras/nodes_cond.py b/comfy_extras/nodes_cond.py index b745a43af..c8091b7a4 100644 --- a/comfy_extras/nodes_cond.py +++ b/comfy_extras/nodes_cond.py @@ -8,7 +8,8 @@ class CLIPTextEncodeControlnet(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="CLIPTextEncodeControlnet", - category="experimental/conditioning", + display_name="CLIP Text Encode (Controlnet)", + category="model/conditioning", inputs=[ io.Clip.Input("clip"), io.Conditioning.Input("conditioning"), @@ -35,11 +36,12 @@ class T5TokenizerOptions(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="T5TokenizerOptions", - category="experimental/conditioning", + display_name="T5 Tokenizer Options", + category="model/conditioning", inputs=[ io.Clip.Input("clip"), - io.Int.Input("min_padding", default=0, min=0, max=10000, step=1, advanced=True), - io.Int.Input("min_length", default=0, min=0, max=10000, step=1, advanced=True), + io.Int.Input("min_padding", default=0, min=0, max=10000, step=1), + io.Int.Input("min_length", default=0, min=0, max=10000, step=1), ], outputs=[io.Clip.Output()], is_experimental=True, diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index c9d7e06fc..56ef5f526 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -1070,7 +1070,7 @@ class AddNoise(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="AddNoise", - category="experimental/custom_sampling/noise", + category="model/sampling/noise", is_experimental=True, inputs=[ io.Model.Input("model"), @@ -1120,7 +1120,7 @@ class ManualSigmas(io.ComfyNode): return io.Schema( node_id="ManualSigmas", search_aliases=["custom noise schedule", "define sigmas"], - category="experimental/custom_sampling", + category="model/sampling/sigmas", is_experimental=True, inputs=[ io.String.Input("sigmas", default="1, 0.5", multiline=False) diff --git a/comfy_extras/nodes_photomaker.py b/comfy_extras/nodes_photomaker.py index 8a2248572..72fad1673 100644 --- a/comfy_extras/nodes_photomaker.py +++ b/comfy_extras/nodes_photomaker.py @@ -123,7 +123,8 @@ class PhotoMakerLoader(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="PhotoMakerLoader", - category="experimental/photomaker", + display_name="Load PhotoMaker Model", + category="model/loaders", inputs=[ io.Combo.Input("photomaker_model_name", options=folder_paths.get_filename_list("photomaker")), ], @@ -149,7 +150,8 @@ class PhotoMakerEncode(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="PhotoMakerEncode", - category="experimental/photomaker", + display_name="PhotoMaker Encode", + category="model/conditioning/photomaker", inputs=[ io.Photomaker.Input("photomaker"), io.Image.Input("image"), diff --git a/comfy_extras/nodes_stable_cascade.py b/comfy_extras/nodes_stable_cascade.py index 6a78ffb47..ddfb4f2b0 100644 --- a/comfy_extras/nodes_stable_cascade.py +++ b/comfy_extras/nodes_stable_cascade.py @@ -119,7 +119,7 @@ class StableCascade_SuperResolutionControlnet(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StableCascade_SuperResolutionControlnet", - category="experimental/stable_cascade", + category="experimental/stable cascade", is_experimental=True, inputs=[ io.Image.Input("image"), diff --git a/comfy_extras/nodes_triposplat.py b/comfy_extras/nodes_triposplat.py index 7bf4703fe..c892213e4 100644 --- a/comfy_extras/nodes_triposplat.py +++ b/comfy_extras/nodes_triposplat.py @@ -143,7 +143,7 @@ class VAEDecodeTripoSplat(IO.ComfyNode): return IO.Schema( node_id="VAEDecodeTripoSplat", display_name="TripoSplat Decode", - category="3d/latent", + category="model/latent/triposplat", description="Decode the sampled TripoSplat latent into a 3D gaussian splat. " "Modify the number of gaussians to vary the density.", inputs=[ @@ -188,7 +188,7 @@ class TripoSplatSamplingPreview(IO.ComfyNode): return IO.Schema( node_id="TripoSplatSamplingPreview", display_name="TripoSplat Sampling Preview", - category="3d/latent", + category="model/latent/triposplat", description="Patch the TripoSplat model for the standard Ksampler node to show a live decoded " "gaussian splat preview at each step.", inputs=[ diff --git a/nodes.py b/nodes.py index 77c577b9a..9043a8d0a 100644 --- a/nodes.py +++ b/nodes.py @@ -349,7 +349,7 @@ class VAEDecodeTiled: RETURN_TYPES = ("IMAGE",) FUNCTION = "decode" - CATEGORY = "experimental" + CATEGORY = "model/latent" def decode(self, vae, samples, tile_size, overlap=64, temporal_size=64, temporal_overlap=8): if tile_size < overlap * 4: @@ -396,7 +396,7 @@ class VAEEncodeTiled: RETURN_TYPES = ("LATENT",) FUNCTION = "encode" - CATEGORY = "experimental" + CATEGORY = "model/latent" def encode(self, vae, pixels, tile_size, overlap, temporal_size=64, temporal_overlap=8): t = vae.encode_tiled(pixels, tile_x=tile_size, tile_y=tile_size, overlap=overlap, tile_t=temporal_size, overlap_t=temporal_overlap) @@ -514,7 +514,7 @@ class SaveLatent: OUTPUT_NODE = True - CATEGORY = "experimental" + CATEGORY = "model/latent" def save(self, samples, filename_prefix="ComfyUI", prompt=None, extra_pnginfo=None): full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir) @@ -559,7 +559,7 @@ class LoadLatent: files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f)) and f.endswith(".latent")] return {"required": {"latent": [sorted(files), ]}, } - CATEGORY = "experimental" + CATEGORY = "model/latent" RETURN_TYPES = ("LATENT", ) FUNCTION = "load" @@ -2155,6 +2155,8 @@ NODE_DISPLAY_NAME_MAPPINGS = { "GLIGENTextBoxApply": "Apply GLIGEN Text Box", "ConditioningZeroOut": "Conditioning Zero Out", # Latent + "LoadLatent": "Load Latent", + "SaveLatent": "Save Latent", "VAEEncodeForInpaint": "VAE Encode (for Inpainting)", "SetLatentNoiseMask": "Set Latent Noise Mask", "VAEDecode": "VAE Decode", @@ -2189,7 +2191,6 @@ NODE_DISPLAY_NAME_MAPPINGS = { "ImageSharpen": "Sharpen Image", "ImageScaleToTotalPixels": "Scale Image to Total Pixels", "GetImageSize": "Get Image Size", - # experimental "VAEDecodeTiled": "VAE Decode (Tiled)", "VAEEncodeTiled": "VAE Encode (Tiled)", } diff --git a/requirements.txt b/requirements.txt index bb11b9605..1d9fe4137 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.45.20 -comfyui-workflow-templates==0.10.7 +comfyui-workflow-templates==0.11.1 comfyui-embedded-docs==0.5.6 torch torchsde