diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 9d90d5a61..4c976058f 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -27,6 +27,7 @@ class Llama2Config: rms_norm_add = False mlp_activation = "silu" qkv_bias = False + rope_dims = None @dataclass class Qwen25_3BConfig: @@ -44,6 +45,7 @@ class Qwen25_3BConfig: rms_norm_add = False mlp_activation = "silu" qkv_bias = True + rope_dims = None @dataclass class Qwen25_7BVLI_Config: @@ -61,6 +63,7 @@ class Qwen25_7BVLI_Config: rms_norm_add = False mlp_activation = "silu" qkv_bias = True + rope_dims = [16, 24, 24] @dataclass class Gemma2_2B_Config: @@ -78,6 +81,7 @@ class Gemma2_2B_Config: rms_norm_add = True mlp_activation = "gelu_pytorch_tanh" qkv_bias = False + rope_dims = None class RMSNorm(nn.Module): def __init__(self, dim: int, eps: float = 1e-5, add=False, device=None, dtype=None): @@ -102,7 +106,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -def precompute_freqs_cis(head_dim, position_ids, theta, device=None): +def precompute_freqs_cis(head_dim, position_ids, theta, rope_dims=None, device=None): theta_numerator = torch.arange(0, head_dim, 2, device=device).float() inv_freq = 1.0 / (theta ** (theta_numerator / head_dim)) @@ -112,12 +116,20 @@ def precompute_freqs_cis(head_dim, position_ids, theta, device=None): emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() sin = emb.sin() + if rope_dims is not None and position_ids.shape[0] > 1: + mrope_section = rope_dims * 2 + cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) + sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) + else: + cos = cos.unsqueeze(1) + sin = sin.unsqueeze(1) + return (cos, sin) def apply_rope(xq, xk, freqs_cis): - cos = freqs_cis[0].unsqueeze(1) - sin = freqs_cis[1].unsqueeze(1) + cos = freqs_cis[0] + sin = freqs_cis[1] q_embed = (xq * cos) + (rotate_half(xq) * sin) k_embed = (xk * cos) + (rotate_half(xk) * sin) return q_embed, k_embed @@ -292,6 +304,7 @@ class Llama2_(nn.Module): freqs_cis = precompute_freqs_cis(self.config.head_dim, position_ids, self.config.rope_theta, + self.config.rope_dims, device=x.device) mask = None diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index 54298e8a9..7a09df55b 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -1315,6 +1315,7 @@ class KlingTaskStatus(str, Enum): class KlingTextToVideoModelName(str, Enum): kling_v1 = 'kling-v1' kling_v1_6 = 'kling-v1-6' + kling_v2_1_master = 'kling-v2-1-master' class KlingVideoGenAspectRatio(str, Enum): @@ -1347,6 +1348,8 @@ class KlingVideoGenModelName(str, Enum): kling_v1_5 = 'kling-v1-5' kling_v1_6 = 'kling-v1-6' kling_v2_master = 'kling-v2-master' + kling_v2_1 = 'kling-v2-1' + kling_v2_1_master = 'kling-v2-1-master' class KlingVideoResult(BaseModel): @@ -1620,13 +1623,14 @@ class MinimaxTaskResultResponse(BaseModel): task_id: str = Field(..., description='The task ID being queried.') -class Model(str, Enum): +class MiniMaxModel(str, Enum): T2V_01_Director = 'T2V-01-Director' I2V_01_Director = 'I2V-01-Director' S2V_01 = 'S2V-01' I2V_01 = 'I2V-01' I2V_01_live = 'I2V-01-live' T2V_01 = 'T2V-01' + Hailuo_02 = 'MiniMax-Hailuo-02' class SubjectReferenceItem(BaseModel): @@ -1648,7 +1652,7 @@ class MinimaxVideoGenerationRequest(BaseModel): None, description='URL or base64 encoding of the first frame image. Required when model is I2V-01, I2V-01-Director, or I2V-01-live.', ) - model: Model = Field( + model: MiniMaxModel = Field( ..., description='Required. ID of model. Options: T2V-01-Director, I2V-01-Director, S2V-01, I2V-01, I2V-01-live, T2V-01', ) @@ -1665,6 +1669,14 @@ class MinimaxVideoGenerationRequest(BaseModel): None, description='Only available when model is S2V-01. The model will generate a video based on the subject uploaded through this parameter.', ) + duration: Optional[int] = Field( + None, + description="The length of the output video in seconds." + ) + resolution: Optional[str] = Field( + None, + description="The dimensions of the video display. 1080p corresponds to 1920 x 1080 pixels, 768p corresponds to 1366 x 768 pixels." + ) class MinimaxVideoGenerationResponse(BaseModel): diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 3751fb2a1..ba4167a50 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -46,6 +46,8 @@ class GeminiModel(str, Enum): gemini_2_5_pro_preview_05_06 = "gemini-2.5-pro-preview-05-06" gemini_2_5_flash_preview_04_17 = "gemini-2.5-flash-preview-04-17" + gemini_2_5_pro = "gemini-2.5-pro" + gemini_2_5_flash = "gemini-2.5-flash" def get_gemini_endpoint( @@ -97,7 +99,7 @@ class GeminiNode(ComfyNodeABC): { "tooltip": "The Gemini model to use for generating responses.", "options": [model.value for model in GeminiModel], - "default": GeminiModel.gemini_2_5_pro_preview_05_06.value, + "default": GeminiModel.gemini_2_5_pro.value, }, ), "seed": ( diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 9d483bb0e..9fa390985 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -421,6 +421,8 @@ class KlingTextToVideoNode(KlingNodeBase): "pro mode / 10s duration / kling-v2-master": ("pro", "10", "kling-v2-master"), "standard mode / 5s duration / kling-v2-master": ("std", "5", "kling-v2-master"), "standard mode / 10s duration / kling-v2-master": ("std", "10", "kling-v2-master"), + "pro mode / 5s duration / kling-v2-1-master": ("pro", "5", "kling-v2-1-master"), + "pro mode / 10s duration / kling-v2-1-master": ("pro", "10", "kling-v2-1-master"), } @classmethod diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index 58d2ed90c..bb3c9e710 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -1,3 +1,4 @@ +from inspect import cleandoc from typing import Union import logging import torch @@ -10,7 +11,7 @@ from comfy_api_nodes.apis import ( MinimaxFileRetrieveResponse, MinimaxTaskResultResponse, SubjectReferenceItem, - Model + MiniMaxModel ) from comfy_api_nodes.apis.client import ( ApiEndpoint, @@ -84,7 +85,6 @@ class MinimaxTextToVideoNode: FUNCTION = "generate_video" CATEGORY = "api node/video/MiniMax" API_NODE = True - OUTPUT_NODE = True async def generate_video( self, @@ -121,7 +121,7 @@ class MinimaxTextToVideoNode: response_model=MinimaxVideoGenerationResponse, ), request=MinimaxVideoGenerationRequest( - model=Model(model), + model=MiniMaxModel(model), prompt=prompt_text, callback_url=None, first_frame_image=image_url, @@ -251,7 +251,6 @@ class MinimaxImageToVideoNode(MinimaxTextToVideoNode): FUNCTION = "generate_video" CATEGORY = "api node/video/MiniMax" API_NODE = True - OUTPUT_NODE = True class MinimaxSubjectToVideoNode(MinimaxTextToVideoNode): @@ -313,7 +312,181 @@ class MinimaxSubjectToVideoNode(MinimaxTextToVideoNode): FUNCTION = "generate_video" CATEGORY = "api node/video/MiniMax" API_NODE = True - OUTPUT_NODE = True + + +class MinimaxHailuoVideoNode: + """Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.""" + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "prompt_text": ( + "STRING", + { + "multiline": True, + "default": "", + "tooltip": "Text prompt to guide the video generation.", + }, + ), + }, + "optional": { + "seed": ( + IO.INT, + { + "default": 0, + "min": 0, + "max": 0xFFFFFFFFFFFFFFFF, + "control_after_generate": True, + "tooltip": "The random seed used for creating the noise.", + }, + ), + "first_frame_image": ( + IO.IMAGE, + { + "tooltip": "Optional image to use as the first frame to generate a video." + }, + ), + "prompt_optimizer": ( + IO.BOOLEAN, + { + "tooltip": "Optimize prompt to improve generation quality when needed.", + "default": True, + }, + ), + "duration": ( + IO.COMBO, + { + "tooltip": "The length of the output video in seconds.", + "default": 6, + "options": [6, 10], + }, + ), + "resolution": ( + IO.COMBO, + { + "tooltip": "The dimensions of the video display. " + "1080p corresponds to 1920 x 1080 pixels, 768p corresponds to 1366 x 768 pixels.", + "default": "768P", + "options": ["768P", "1080P"], + }, + ), + }, + "hidden": { + "auth_token": "AUTH_TOKEN_COMFY_ORG", + "comfy_api_key": "API_KEY_COMFY_ORG", + "unique_id": "UNIQUE_ID", + }, + } + + RETURN_TYPES = ("VIDEO",) + DESCRIPTION = cleandoc(__doc__ or "") + FUNCTION = "generate_video" + CATEGORY = "api node/video/MiniMax" + API_NODE = True + + async def generate_video( + self, + prompt_text, + seed=0, + first_frame_image: torch.Tensor=None, # used for ImageToVideo + prompt_optimizer=True, + duration=6, + resolution="768P", + model="MiniMax-Hailuo-02", + unique_id: Union[str, None]=None, + **kwargs, + ): + if first_frame_image is None: + validate_string(prompt_text, field_name="prompt_text") + + if model == "MiniMax-Hailuo-02" and resolution.upper() == "1080P" and duration != 6: + raise Exception( + "When model is MiniMax-Hailuo-02 and resolution is 1080P, duration is limited to 6 seconds." + ) + + # upload image, if passed in + image_url = None + if first_frame_image is not None: + image_url = (await upload_images_to_comfyapi(first_frame_image, max_images=1, auth_kwargs=kwargs))[0] + + video_generate_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/minimax/video_generation", + method=HttpMethod.POST, + request_model=MinimaxVideoGenerationRequest, + response_model=MinimaxVideoGenerationResponse, + ), + request=MinimaxVideoGenerationRequest( + model=MiniMaxModel(model), + prompt=prompt_text, + callback_url=None, + first_frame_image=image_url, + prompt_optimizer=prompt_optimizer, + duration=duration, + resolution=resolution, + ), + auth_kwargs=kwargs, + ) + response = await video_generate_operation.execute() + + task_id = response.task_id + if not task_id: + raise Exception(f"MiniMax generation failed: {response.base_resp}") + + average_duration = 120 if resolution == "768P" else 240 + video_generate_operation = PollingOperation( + poll_endpoint=ApiEndpoint( + path="/proxy/minimax/query/video_generation", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=MinimaxTaskResultResponse, + query_params={"task_id": task_id}, + ), + completed_statuses=["Success"], + failed_statuses=["Fail"], + status_extractor=lambda x: x.status.value, + estimated_duration=average_duration, + node_id=unique_id, + auth_kwargs=kwargs, + ) + task_result = await video_generate_operation.execute() + + file_id = task_result.file_id + if file_id is None: + raise Exception("Request was not successful. Missing file ID.") + file_retrieve_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/minimax/files/retrieve", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=MinimaxFileRetrieveResponse, + query_params={"file_id": int(file_id)}, + ), + request=EmptyRequest(), + auth_kwargs=kwargs, + ) + file_result = await file_retrieve_operation.execute() + + file_url = file_result.file.download_url + if file_url is None: + raise Exception( + f"No video was found in the response. Full response: {file_result.model_dump()}" + ) + logging.info(f"Generated video URL: {file_url}") + if unique_id: + if hasattr(file_result.file, "backup_download_url"): + message = f"Result URL: {file_url}\nBackup URL: {file_result.file.backup_download_url}" + else: + message = f"Result URL: {file_url}" + PromptServer.instance.send_progress_text(message, unique_id) + + video_io = await download_url_to_bytesio(file_url) + if video_io is None: + error_msg = f"Failed to download video from {file_url}" + logging.error(error_msg) + raise Exception(error_msg) + return (VideoFromFile(video_io),) # A dictionary that contains all nodes you want to export with their names @@ -322,6 +495,7 @@ NODE_CLASS_MAPPINGS = { "MinimaxTextToVideoNode": MinimaxTextToVideoNode, "MinimaxImageToVideoNode": MinimaxImageToVideoNode, # "MinimaxSubjectToVideoNode": MinimaxSubjectToVideoNode, + "MinimaxHailuoVideoNode": MinimaxHailuoVideoNode, } # A dictionary that contains the friendly/humanly readable titles for the nodes @@ -329,4 +503,5 @@ NODE_DISPLAY_NAME_MAPPINGS = { "MinimaxTextToVideoNode": "MiniMax Text to Video", "MinimaxImageToVideoNode": "MiniMax Image to Video", "MinimaxSubjectToVideoNode": "MiniMax Subject to Video", + "MinimaxHailuoVideoNode": "MiniMax Hailuo Video", } diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index cbff2b2d2..674c9ede0 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -80,6 +80,9 @@ class SupportedOpenAIModel(str, Enum): gpt_4_1 = "gpt-4.1" gpt_4_1_mini = "gpt-4.1-mini" gpt_4_1_nano = "gpt-4.1-nano" + gpt_5 = "gpt-5" + gpt_5_mini = "gpt-5-mini" + gpt_5_nano = "gpt-5-nano" class OpenAIDalle2(ComfyNodeABC): diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py new file mode 100644 index 000000000..2f441948c --- /dev/null +++ b/comfy_api_nodes/nodes_vidu.py @@ -0,0 +1,622 @@ +import logging +from enum import Enum +from typing import Any, Callable, Optional, Literal, TypeVar +from typing_extensions import override + +import torch +from pydantic import BaseModel, Field + +from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api_nodes.util.validation_utils import ( + validate_aspect_ratio_closeness, + validate_image_dimensions, + validate_image_aspect_ratio_range, + get_number_of_images, +) +from comfy_api_nodes.apis.client import ( + ApiEndpoint, + HttpMethod, + SynchronousOperation, + PollingOperation, + EmptyRequest, +) +from comfy_api_nodes.apinode_utils import download_url_to_video_output, upload_images_to_comfyapi + + +VIDU_TEXT_TO_VIDEO = "/proxy/vidu/text2video" +VIDU_IMAGE_TO_VIDEO = "/proxy/vidu/img2video" +VIDU_REFERENCE_VIDEO = "/proxy/vidu/reference2video" +VIDU_START_END_VIDEO = "/proxy/vidu/start-end2video" +VIDU_GET_GENERATION_STATUS = "/proxy/vidu/tasks/%s/creations" + +R = TypeVar("R") + +class VideoModelName(str, Enum): + vidu_q1 = 'viduq1' + + +class AspectRatio(str, Enum): + r_16_9 = "16:9" + r_9_16 = "9:16" + r_1_1 = "1:1" + + +class Resolution(str, Enum): + r_1080p = "1080p" + + +class MovementAmplitude(str, Enum): + auto = "auto" + small = "small" + medium = "medium" + large = "large" + + +class TaskCreationRequest(BaseModel): + model: VideoModelName = VideoModelName.vidu_q1 + prompt: Optional[str] = Field(None, max_length=1500) + duration: Optional[Literal[5]] = 5 + seed: Optional[int] = Field(0, ge=0, le=2147483647) + aspect_ratio: Optional[AspectRatio] = AspectRatio.r_16_9 + resolution: Optional[Resolution] = Resolution.r_1080p + movement_amplitude: Optional[MovementAmplitude] = MovementAmplitude.auto + images: Optional[list[str]] = Field(None, description="Base64 encoded string or image URL") + + +class TaskStatus(str, Enum): + created = "created" + queueing = "queueing" + processing = "processing" + success = "success" + failed = "failed" + + +class TaskCreationResponse(BaseModel): + task_id: str = Field(...) + state: TaskStatus = Field(...) + created_at: str = Field(...) + code: Optional[int] = Field(None, description="Error code") + + +class TaskResult(BaseModel): + id: str = Field(..., description="Creation id") + url: str = Field(..., description="The URL of the generated results, valid for one hour") + cover_url: str = Field(..., description="The cover URL of the generated results, valid for one hour") + + +class TaskStatusResponse(BaseModel): + state: TaskStatus = Field(...) + err_code: Optional[str] = Field(None) + creations: list[TaskResult] = Field(..., description="Generated results") + + +async def poll_until_finished( + auth_kwargs: dict[str, str], + api_endpoint: ApiEndpoint[Any, R], + result_url_extractor: Optional[Callable[[R], str]] = None, + estimated_duration: Optional[int] = None, + node_id: Optional[str] = None, +) -> R: + return await PollingOperation( + poll_endpoint=api_endpoint, + completed_statuses=[TaskStatus.success.value], + failed_statuses=[TaskStatus.failed.value], + status_extractor=lambda response: response.state.value, + auth_kwargs=auth_kwargs, + result_url_extractor=result_url_extractor, + estimated_duration=estimated_duration, + node_id=node_id, + poll_interval=16.0, + max_poll_attempts=256, + ).execute() + + +def get_video_url_from_response(response) -> Optional[str]: + if response.creations: + return response.creations[0].url + return None + + +def get_video_from_response(response) -> TaskResult: + if not response.creations: + error_msg = f"Vidu request does not contain results. State: {response.state}, Error Code: {response.err_code}" + logging.info(error_msg) + raise RuntimeError(error_msg) + logging.info("Vidu task %s succeeded. Video URL: %s", response.creations[0].id, response.creations[0].url) + return response.creations[0] + + +async def execute_task( + vidu_endpoint: str, + auth_kwargs: Optional[dict[str, str]], + payload: TaskCreationRequest, + estimated_duration: int, + node_id: str, +) -> R: + response = await SynchronousOperation( + endpoint=ApiEndpoint( + path=vidu_endpoint, + method=HttpMethod.POST, + request_model=TaskCreationRequest, + response_model=TaskCreationResponse, + ), + request=payload, + auth_kwargs=auth_kwargs, + ).execute() + if response.state == TaskStatus.failed: + error_msg = f"Vidu request failed. Code: {response.code}" + logging.error(error_msg) + raise RuntimeError(error_msg) + return await poll_until_finished( + auth_kwargs, + ApiEndpoint( + path=VIDU_GET_GENERATION_STATUS % response.task_id, + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=TaskStatusResponse, + ), + result_url_extractor=get_video_url_from_response, + estimated_duration=estimated_duration, + node_id=node_id, + ) + + +class ViduTextToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduTextToVideoNode", + display_name="Vidu Text To Video Generation", + category="api node/video/Vidu", + description="Generate video from text prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="A textual description for video generation", + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=[model.value for model in AspectRatio], + default=AspectRatio.r_16_9.value, + tooltip="The aspect ratio of the output video", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + duration: int, + seed: int, + aspect_ratio: str, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + if not prompt: + raise ValueError("The prompt field is required and cannot be empty.") + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + aspect_ratio=aspect_ratio, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + results = await execute_task(VIDU_TEXT_TO_VIDEO, auth, payload, 320, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduImageToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduImageToVideoNode", + display_name="Vidu Image To Video Generation", + category="api node/video/Vidu", + description="Generate video from image and optional prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "image", + tooltip="An image to be used as the start frame of the generated video", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="A textual description for video generation", + optional=True, + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + image: torch.Tensor, + prompt: str, + duration: int, + seed: int, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + if get_number_of_images(image) > 1: + raise ValueError("Only one input image is allowed.") + validate_image_aspect_ratio_range(image, (1, 4), (4, 1)) + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + payload.images = await upload_images_to_comfyapi( + image, + max_images=1, + mime_type="image/png", + auth_kwargs=auth, + ) + results = await execute_task(VIDU_IMAGE_TO_VIDEO, auth, payload, 120, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduReferenceVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduReferenceVideoNode", + display_name="Vidu Reference To Video Generation", + category="api node/video/Vidu", + description="Generate video from multiple images and prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "images", + tooltip="Images to use as references to generate a video with consistent subjects (max 7 images).", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="A textual description for video generation", + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=[model.value for model in AspectRatio], + default=AspectRatio.r_16_9.value, + tooltip="The aspect ratio of the output video", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + images: torch.Tensor, + prompt: str, + duration: int, + seed: int, + aspect_ratio: str, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + if not prompt: + raise ValueError("The prompt field is required and cannot be empty.") + a = get_number_of_images(images) + if a > 7: + raise ValueError("Too many images, maximum allowed is 7.") + for image in images: + validate_image_aspect_ratio_range(image, (1, 4), (4, 1)) + validate_image_dimensions(image, min_width=128, min_height=128) + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + aspect_ratio=aspect_ratio, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + payload.images = await upload_images_to_comfyapi( + images, + max_images=7, + mime_type="image/png", + auth_kwargs=auth, + ) + results = await execute_task(VIDU_REFERENCE_VIDEO, auth, payload, 120, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduStartEndToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduStartEndToVideoNode", + display_name="Vidu Start End To Video Generation", + category="api node/video/Vidu", + description="Generate a video from start and end frames and a prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "first_frame", + tooltip="Start frame", + ), + comfy_io.Image.Input( + "end_frame", + tooltip="End frame", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="A textual description for video generation", + optional=True, + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + first_frame: torch.Tensor, + end_frame: torch.Tensor, + prompt: str, + duration: int, + seed: int, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + validate_aspect_ratio_closeness(first_frame, end_frame, min_rel=0.8, max_rel=1.25, strict=False) + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + payload.images = [ + (await upload_images_to_comfyapi(frame, max_images=1, mime_type="image/png", auth_kwargs=auth))[0] + for frame in (first_frame, end_frame) + ] + results = await execute_task(VIDU_START_END_VIDEO, auth, payload, 96, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + ViduTextToVideoNode, + ViduImageToVideoNode, + ViduReferenceVideoNode, + ViduStartEndToVideoNode, + ] + +async def comfy_entrypoint() -> ViduExtension: + return ViduExtension() diff --git a/comfy_api_nodes/util/validation_utils.py b/comfy_api_nodes/util/validation_utils.py index 031b9fbd3..606b794bf 100644 --- a/comfy_api_nodes/util/validation_utils.py +++ b/comfy_api_nodes/util/validation_utils.py @@ -53,6 +53,53 @@ def validate_image_aspect_ratio( ) +def validate_image_aspect_ratio_range( + image: torch.Tensor, + min_ratio: tuple[float, float], # e.g. (1, 4) + max_ratio: tuple[float, float], # e.g. (4, 1) + *, + strict: bool = True, # True -> (min, max); False -> [min, max] +) -> float: + a1, b1 = min_ratio + a2, b2 = max_ratio + if a1 <= 0 or b1 <= 0 or a2 <= 0 or b2 <= 0: + raise ValueError("Ratios must be positive, like (1, 4) or (4, 1).") + lo, hi = (a1 / b1), (a2 / b2) + if lo > hi: + lo, hi = hi, lo + a1, b1, a2, b2 = a2, b2, a1, b1 # swap only for error text + w, h = get_image_dimensions(image) + if w <= 0 or h <= 0: + raise ValueError(f"Invalid image dimensions: {w}x{h}") + ar = w / h + ok = (lo < ar < hi) if strict else (lo <= ar <= hi) + if not ok: + op = "<" if strict else "≤" + raise ValueError(f"Image aspect ratio {ar:.6g} is outside allowed range: {a1}:{b1} {op} ratio {op} {a2}:{b2}") + return ar + + +def validate_aspect_ratio_closeness( + start_img, + end_img, + min_rel: float, + max_rel: float, + *, + strict: bool = False, # True => exclusive, False => inclusive +) -> None: + w1, h1 = get_image_dimensions(start_img) + w2, h2 = get_image_dimensions(end_img) + if min(w1, h1, w2, h2) <= 0: + raise ValueError("Invalid image dimensions") + ar1 = w1 / h1 + ar2 = w2 / h2 + # Normalize so it is symmetric (no need to check both ar1/ar2 and ar2/ar1) + closeness = max(ar1, ar2) / min(ar1, ar2) + limit = max(max_rel, 1.0 / min_rel) # for 0.8..1.25 this is 1.25 + if (closeness >= limit) if strict else (closeness > limit): + raise ValueError(f"Aspect ratios must be close: start/end={ar1/ar2:.4f}, allowed range {min_rel}–{max_rel}.") + + def validate_video_dimensions( video: VideoInput, min_width: Optional[int] = None, @@ -98,3 +145,9 @@ def validate_video_duration( raise ValueError( f"Video duration must be at most {max_duration}s, got {duration}s" ) + + +def get_number_of_images(images): + if isinstance(images, torch.Tensor): + return images.shape[0] if images.ndim >= 4 else 1 + return len(images) diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py index b5088fae2..fff89556f 100644 --- a/comfy_extras/nodes_qwen.py +++ b/comfy_extras/nodes_qwen.py @@ -1,25 +1,6 @@ import node_helpers import comfy.utils - -PREFERRED_QWENIMAGE_RESOLUTIONS = [ - (672, 1568), - (688, 1504), - (720, 1456), - (752, 1392), - (800, 1328), - (832, 1248), - (880, 1184), - (944, 1104), - (1024, 1024), - (1104, 944), - (1184, 880), - (1248, 832), - (1328, 800), - (1392, 752), - (1456, 720), - (1504, 688), - (1568, 672), -] +import math class TextEncodeQwenImageEdit: @@ -42,13 +23,17 @@ class TextEncodeQwenImageEdit: if image is None: images = [] else: - images = [image] + samples = image.movedim(-1, 1) + total = int(1024 * 1024) + + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + width = round(samples.shape[3] * scale_by) + height = round(samples.shape[2] * scale_by) + + s = comfy.utils.common_upscale(samples, width, height, "area", "disabled") + image = s.movedim(1, -1) + images = [image[:, :, :, :3]] if vae is not None: - width = image.shape[2] - height = image.shape[1] - aspect_ratio = width / height - _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS) - image = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "lanczos", "center").movedim(1, -1) ref_latent = vae.encode(image[:, :, :, :3]) tokens = clip.tokenize(prompt, images=images) diff --git a/nodes.py b/nodes.py index c16bbb666..94b10c259 100644 --- a/nodes.py +++ b/nodes.py @@ -2360,6 +2360,7 @@ async def init_builtin_api_nodes(): "nodes_moonvalley.py", "nodes_rodin.py", "nodes_gemini.py", + "nodes_vidu.py", ] if not await load_custom_node(os.path.join(api_nodes_dir, "canary.py"), module_parent="comfy_api_nodes"):