ComfyUI/comfy_api_nodes/nodes_minimax.py
octo-patch 3c5f87b270 feat(api-nodes): add MiniMax Chat and TTS nodes
- Add MinimaxChatNode for text generation using MiniMax-M2.7 and
  MiniMax-M2.7-highspeed models via OpenAI-compatible API
- Add MinimaxTextToSpeechNode for speech synthesis using speech-2.8-hd
  and speech-2.8-turbo models via MiniMax TTS API
- Add Pydantic request/response models for Chat and TTS APIs to
  comfy_api_nodes/apis/minimax.py
- Add unit tests for all new API models
2026-04-12 18:52:48 +08:00

661 lines
22 KiB
Python

from typing import Optional
import torch
from typing_extensions import override
from comfy_api.latest import IO, ComfyExtension
from comfy_api_nodes.apis.minimax import (
MinimaxChatMessage,
MinimaxChatRequest,
MinimaxChatResponse,
MinimaxFileRetrieveResponse,
MiniMaxModel,
MinimaxTaskResultResponse,
MinimaxTTSAudioSetting,
MinimaxTTSRequest,
MinimaxTTSVoiceSetting,
MinimaxVideoGenerationRequest,
MinimaxVideoGenerationResponse,
SubjectReferenceItem,
)
from comfy_api_nodes.util import (
ApiEndpoint,
audio_bytes_to_audio_input,
download_url_to_video_output,
poll_op,
sync_op,
sync_op_raw,
upload_images_to_comfyapi,
validate_string,
)
# Estimated average generation times in seconds, passed to poll_op as
# `estimated_duration` so the UI can show a meaningful progress estimate.
# (Values appear empirical — TODO confirm against current API latency.)
I2V_AVERAGE_DURATION = 114
T2V_AVERAGE_DURATION = 234
async def _generate_mm_video(
    cls: type[IO.ComfyNode],
    *,
    prompt_text: str,
    seed: int,
    model: str,
    image: Optional[torch.Tensor] = None,  # used for ImageToVideo
    subject: Optional[torch.Tensor] = None,  # used for SubjectToVideo
    average_duration: Optional[int] = None,
) -> IO.NodeOutput:
    """Shared MiniMax video pipeline: submit a job, poll it, look up the file, download the video.

    The prompt is only validated when no first-frame ``image`` is supplied.
    ``seed`` is accepted but not forwarded in the request body — presumably it
    exists only so control_after_generate can re-queue the node (confirm).
    ``average_duration`` is purely a progress-reporting hint for poll_op.
    """
    if image is None:
        validate_string(prompt_text, field_name="prompt_text")

    # Upload the optional first-frame image and obtain a URL the API can fetch.
    first_frame_url = None
    if image is not None:
        first_frame_url = (await upload_images_to_comfyapi(cls, image, max_images=1))[0]

    # TODO: figure out how to deal with subject properly, API returns invalid params when using S2V-01 model
    subject_reference = None
    if subject is not None:
        subject_url = (await upload_images_to_comfyapi(cls, subject, max_images=1))[0]
        subject_reference = [SubjectReferenceItem(image=subject_url)]

    submit = await sync_op(
        cls,
        ApiEndpoint(path="/proxy/minimax/video_generation", method="POST"),
        response_model=MinimaxVideoGenerationResponse,
        data=MinimaxVideoGenerationRequest(
            model=MiniMaxModel(model),
            prompt=prompt_text,
            callback_url=None,
            first_frame_image=first_frame_url,
            subject_reference=subject_reference,
            prompt_optimizer=None,
        ),
    )
    if not submit.task_id:
        raise Exception(f"MiniMax generation failed: {submit.base_resp}")

    polled = await poll_op(
        cls,
        ApiEndpoint(path="/proxy/minimax/query/video_generation", query_params={"task_id": submit.task_id}),
        response_model=MinimaxTaskResultResponse,
        status_extractor=lambda result: result.status.value,
        estimated_duration=average_duration,
    )
    if polled.file_id is None:
        raise Exception("Request was not successful. Missing file ID.")

    # The task only yields a file id; a second call resolves it to download URLs.
    retrieved = await sync_op(
        cls,
        ApiEndpoint(path="/proxy/minimax/files/retrieve", query_params={"file_id": int(polled.file_id)}),
        response_model=MinimaxFileRetrieveResponse,
    )
    primary_url = retrieved.file.download_url
    if primary_url is None:
        raise Exception(f"No video was found in the response. Full response: {retrieved.model_dump()}")

    backup_url = retrieved.file.backup_download_url
    if backup_url:
        try:
            # Short timeout / few retries on the primary URL so we can fall back quickly.
            return IO.NodeOutput(await download_url_to_video_output(primary_url, timeout=10, max_retries=2))
        except Exception:  # if we have a second URL to retrieve the result, try again using that one
            return IO.NodeOutput(await download_url_to_video_output(backup_url, max_retries=3))
    return IO.NodeOutput(await download_url_to_video_output(primary_url))
class MinimaxTextToVideoNode(IO.ComfyNode):
    """API node: generate a MiniMax video from a text prompt alone."""

    @classmethod
    def define_schema(cls) -> IO.Schema:
        # Declare the widgets up front, then assemble the schema.
        prompt_input = IO.String.Input(
            "prompt_text",
            multiline=True,
            default="",
            tooltip="Text prompt to guide the video generation",
        )
        model_input = IO.Combo.Input(
            "model",
            options=["T2V-01", "T2V-01-Director"],
            default="T2V-01",
            tooltip="Model to use for video generation",
        )
        seed_input = IO.Int.Input(
            "seed",
            default=0,
            min=0,
            max=0xFFFFFFFFFFFFFFFF,
            step=1,
            control_after_generate=True,
            tooltip="The random seed used for creating the noise.",
            optional=True,
        )
        return IO.Schema(
            node_id="MinimaxTextToVideoNode",
            display_name="MiniMax Text to Video",
            category="api node/video/MiniMax",
            description="Generates videos synchronously based on a prompt, and optional parameters.",
            inputs=[prompt_input, model_input, seed_input],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=IO.PriceBadge(expr="""{"type":"usd","usd":0.43}"""),
        )

    @classmethod
    async def execute(
        cls,
        prompt_text: str,
        model: str = "T2V-01",
        seed: int = 0,
    ) -> IO.NodeOutput:
        """Delegate to the shared pipeline; text-to-video supplies neither image nor subject."""
        return await _generate_mm_video(
            cls,
            model=model,
            prompt_text=prompt_text,
            seed=seed,
            image=None,
            subject=None,
            average_duration=T2V_AVERAGE_DURATION,
        )
class MinimaxImageToVideoNode(IO.ComfyNode):
    """API node: generate a MiniMax video from a first-frame image plus a prompt."""

    @classmethod
    def define_schema(cls) -> IO.Schema:
        # Declare the widgets up front, then assemble the schema.
        image_input = IO.Image.Input(
            "image",
            tooltip="Image to use as first frame of video generation",
        )
        prompt_input = IO.String.Input(
            "prompt_text",
            multiline=True,
            default="",
            tooltip="Text prompt to guide the video generation",
        )
        model_input = IO.Combo.Input(
            "model",
            options=["I2V-01-Director", "I2V-01", "I2V-01-live"],
            default="I2V-01",
            tooltip="Model to use for video generation",
        )
        seed_input = IO.Int.Input(
            "seed",
            default=0,
            min=0,
            max=0xFFFFFFFFFFFFFFFF,
            step=1,
            control_after_generate=True,
            tooltip="The random seed used for creating the noise.",
            optional=True,
        )
        return IO.Schema(
            node_id="MinimaxImageToVideoNode",
            display_name="MiniMax Image to Video",
            category="api node/video/MiniMax",
            description="Generates videos synchronously based on an image and prompt, and optional parameters.",
            inputs=[image_input, prompt_input, model_input, seed_input],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=IO.PriceBadge(expr="""{"type":"usd","usd":0.43}"""),
        )

    @classmethod
    async def execute(
        cls,
        image: torch.Tensor,
        prompt_text: str,
        model: str = "I2V-01",
        seed: int = 0,
    ) -> IO.NodeOutput:
        """Delegate to the shared pipeline with the image as the first frame."""
        return await _generate_mm_video(
            cls,
            model=model,
            prompt_text=prompt_text,
            seed=seed,
            image=image,
            subject=None,
            average_duration=I2V_AVERAGE_DURATION,
        )
class MinimaxSubjectToVideoNode(IO.ComfyNode):
    """API node: generate a MiniMax video referencing a subject image (S2V-01)."""

    @classmethod
    def define_schema(cls) -> IO.Schema:
        # Declare the widgets up front, then assemble the schema.
        subject_input = IO.Image.Input(
            "subject",
            tooltip="Image of subject to reference for video generation",
        )
        prompt_input = IO.String.Input(
            "prompt_text",
            multiline=True,
            default="",
            tooltip="Text prompt to guide the video generation",
        )
        model_input = IO.Combo.Input(
            "model",
            options=["S2V-01"],
            default="S2V-01",
            tooltip="Model to use for video generation",
        )
        seed_input = IO.Int.Input(
            "seed",
            default=0,
            min=0,
            max=0xFFFFFFFFFFFFFFFF,
            step=1,
            control_after_generate=True,
            tooltip="The random seed used for creating the noise.",
            optional=True,
        )
        return IO.Schema(
            node_id="MinimaxSubjectToVideoNode",
            display_name="MiniMax Subject to Video",
            category="api node/video/MiniMax",
            description="Generates videos synchronously based on an image and prompt, and optional parameters.",
            inputs=[subject_input, prompt_input, model_input, seed_input],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
        )

    @classmethod
    async def execute(
        cls,
        subject: torch.Tensor,
        prompt_text: str,
        model: str = "S2V-01",
        seed: int = 0,
    ) -> IO.NodeOutput:
        """Delegate to the shared pipeline, passing the subject reference image."""
        return await _generate_mm_video(
            cls,
            model=model,
            prompt_text=prompt_text,
            seed=seed,
            image=None,
            subject=subject,
            average_duration=T2V_AVERAGE_DURATION,
        )
class MinimaxHailuoVideoNode(IO.ComfyNode):
    """API node for the MiniMax Hailuo-02 model: prompt-driven video generation
    with an optional first-frame image, selectable duration and resolution."""

    @classmethod
    def define_schema(cls) -> IO.Schema:
        return IO.Schema(
            node_id="MinimaxHailuoVideoNode",
            display_name="MiniMax Hailuo Video",
            category="api node/video/MiniMax",
            description="Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.",
            inputs=[
                IO.String.Input(
                    "prompt_text",
                    multiline=True,
                    default="",
                    tooltip="Text prompt to guide the video generation.",
                ),
                IO.Int.Input(
                    "seed",
                    default=0,
                    min=0,
                    max=0xFFFFFFFFFFFFFFFF,
                    step=1,
                    control_after_generate=True,
                    tooltip="The random seed used for creating the noise.",
                    optional=True,
                ),
                IO.Image.Input(
                    "first_frame_image",
                    tooltip="Optional image to use as the first frame to generate a video.",
                    optional=True,
                ),
                IO.Boolean.Input(
                    "prompt_optimizer",
                    default=True,
                    tooltip="Optimize prompt to improve generation quality when needed.",
                    optional=True,
                ),
                IO.Combo.Input(
                    "duration",
                    options=[6, 10],
                    default=6,
                    tooltip="The length of the output video in seconds.",
                    optional=True,
                ),
                IO.Combo.Input(
                    "resolution",
                    options=["768P", "1080P"],
                    default="768P",
                    tooltip="The dimensions of the video display. 1080p is 1920x1080, 768p is 1366x768.",
                    optional=True,
                ),
            ],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            # JSONata price expression keyed by resolution/duration; falls back
            # to 0.43 USD for combinations not in the table (e.g. 1080p/10,
            # which execute() rejects anyway).
            price_badge=IO.PriceBadge(
                depends_on=IO.PriceBadgeDepends(widgets=["resolution", "duration"]),
                expr="""
                (
                    $prices := {
                        "768p": {"6": 0.28, "10": 0.56},
                        "1080p": {"6": 0.49}
                    };
                    $resPrices := $lookup($prices, $lowercase(widgets.resolution));
                    $price := $lookup($resPrices, $string(widgets.duration));
                    {"type":"usd","usd": $price ? $price : 0.43}
                )
                """,
            ),
        )

    @classmethod
    async def execute(
        cls,
        prompt_text: str,
        seed: int = 0,
        first_frame_image: Optional[torch.Tensor] = None,  # used for ImageToVideo
        prompt_optimizer: bool = True,
        duration: int = 6,
        resolution: str = "768P",
        model: str = "MiniMax-Hailuo-02",
    ) -> IO.NodeOutput:
        """Submit a Hailuo-02 generation request, poll until done, then download the video.

        NOTE(review): ``seed`` is accepted but never sent to the API —
        presumably it exists only so control_after_generate can re-queue
        the node; confirm.
        """
        # The prompt is only mandatory when no first frame is supplied.
        if first_frame_image is None:
            validate_string(prompt_text, field_name="prompt_text")
        # API restriction: at 1080P only 6-second clips are supported.
        if model == "MiniMax-Hailuo-02" and resolution.upper() == "1080P" and duration != 6:
            raise Exception(
                "When model is MiniMax-Hailuo-02 and resolution is 1080P, duration is limited to 6 seconds."
            )
        # upload image, if passed in
        image_url = None
        if first_frame_image is not None:
            image_url = (await upload_images_to_comfyapi(cls, first_frame_image, max_images=1))[0]
        response = await sync_op(
            cls,
            ApiEndpoint(path="/proxy/minimax/video_generation", method="POST"),
            response_model=MinimaxVideoGenerationResponse,
            data=MinimaxVideoGenerationRequest(
                model=MiniMaxModel(model),
                prompt=prompt_text,
                callback_url=None,
                first_frame_image=image_url,
                prompt_optimizer=prompt_optimizer,
                duration=duration,
                resolution=resolution,
            ),
        )
        task_id = response.task_id
        if not task_id:
            raise Exception(f"MiniMax generation failed: {response.base_resp}")
        # Progress-bar hint only: 768P jobs finish faster on average than 1080P.
        average_duration = 120 if resolution == "768P" else 240
        task_result = await poll_op(
            cls,
            ApiEndpoint(path="/proxy/minimax/query/video_generation", query_params={"task_id": task_id}),
            response_model=MinimaxTaskResultResponse,
            status_extractor=lambda x: x.status.value,
            estimated_duration=average_duration,
        )
        file_id = task_result.file_id
        if file_id is None:
            raise Exception("Request was not successful. Missing file ID.")
        # The task result only carries a file id; resolve it to download URLs.
        file_result = await sync_op(
            cls,
            ApiEndpoint(path="/proxy/minimax/files/retrieve", query_params={"file_id": int(file_id)}),
            response_model=MinimaxFileRetrieveResponse,
        )
        file_url = file_result.file.download_url
        if file_url is None:
            raise Exception(f"No video was found in the response. Full response: {file_result.model_dump()}")
        if file_result.file.backup_download_url:
            try:
                # Short timeout / few retries on the primary URL so the backup
                # can be tried quickly on failure.
                return IO.NodeOutput(await download_url_to_video_output(file_url, timeout=10, max_retries=2))
            except Exception:  # if we have a second URL to retrieve the result, try again using that one
                return IO.NodeOutput(
                    await download_url_to_video_output(file_result.file.backup_download_url, max_retries=3)
                )
        return IO.NodeOutput(await download_url_to_video_output(file_url))
# Chat models offered by MinimaxChatNode's model combo.
MINIMAX_CHAT_MODELS = ["MiniMax-M2.7", "MiniMax-M2.7-highspeed"]
# TTS models offered by MinimaxTextToSpeechNode's model combo.
MINIMAX_TTS_MODELS = ["speech-2.8-hd", "speech-2.8-turbo"]
# Voice presets; each entry is passed verbatim as the TTS request's voice_id.
MINIMAX_TTS_VOICES = [
    "English_Graceful_Lady",
    "English_Insightful_Speaker",
    "English_radiant_girl",
    "English_Persuasive_Man",
    "English_Lucky_Robot",
    "English_expressive_narrator",
]
class MinimaxChatNode(IO.ComfyNode):
    """
    Node to generate text responses from a MiniMax chat model.
    """

    @classmethod
    def define_schema(cls) -> IO.Schema:
        return IO.Schema(
            node_id="MinimaxChatNode",
            display_name="MiniMax Chat",
            category="api node/text/MiniMax",
            description="Generate text responses using MiniMax chat models (M2.7 series).",
            inputs=[
                IO.String.Input(
                    "prompt",
                    multiline=True,
                    default="",
                    tooltip="Text prompt to send to the MiniMax chat model.",
                ),
                IO.Combo.Input(
                    "model",
                    options=MINIMAX_CHAT_MODELS,
                    default="MiniMax-M2.7",
                    tooltip="The MiniMax chat model to use. MiniMax-M2.7-highspeed is faster with the same performance.",
                ),
                IO.String.Input(
                    "system_prompt",
                    multiline=True,
                    default="",
                    optional=True,
                    tooltip="Optional system prompt to set the behavior of the assistant.",
                    advanced=True,
                ),
                IO.Int.Input(
                    "max_tokens",
                    default=1024,
                    min=1,
                    max=8192,
                    step=1,
                    optional=True,
                    tooltip="Maximum number of tokens to generate.",
                    advanced=True,
                ),
            ],
            outputs=[
                IO.String.Output(),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            # Pricing is per-token, so only a static text badge can be shown.
            price_badge=IO.PriceBadge(
                expr="""{"type":"text","text":"Token-based"}""",
            ),
        )

    @classmethod
    async def execute(
        cls,
        prompt: str,
        model: str = "MiniMax-M2.7",
        system_prompt: str = "",
        max_tokens: int = 1024,
    ) -> IO.NodeOutput:
        """Send the prompt (plus optional system prompt) to the MiniMax chat
        endpoint and return the first choice's message text.

        Returns a placeholder string if the API responds with no choices.
        """
        # Fix: pass field_name so a validation failure names the offending
        # input, consistent with the other validate_string calls in this file.
        validate_string(prompt, strip_whitespace=False, field_name="prompt")
        # Build the OpenAI-style message list; system prompt goes first if set.
        messages: list[MinimaxChatMessage] = []
        if system_prompt.strip():
            messages.append(MinimaxChatMessage(role="system", content=system_prompt))
        messages.append(MinimaxChatMessage(role="user", content=prompt))
        response = await sync_op(
            cls,
            ApiEndpoint(path="/proxy/minimax/v1/chat/completions", method="POST"),
            response_model=MinimaxChatResponse,
            data=MinimaxChatRequest(
                model=model,
                messages=messages,
                temperature=1.0,
                max_tokens=max_tokens,
                stream=False,
            ),
        )
        if not response.choices:
            return IO.NodeOutput("Empty response from MiniMax model.")
        return IO.NodeOutput(response.choices[0].message.content)
class MinimaxTextToSpeechNode(IO.ComfyNode):
    """
    Node to convert text to speech using MiniMax TTS API.
    """

    @classmethod
    def define_schema(cls) -> IO.Schema:
        return IO.Schema(
            node_id="MinimaxTextToSpeechNode",
            display_name="MiniMax Text to Speech",
            category="api node/audio/MiniMax",
            description="Convert text to speech using MiniMax TTS models.",
            inputs=[
                IO.String.Input(
                    "text",
                    multiline=True,
                    default="",
                    tooltip="The text to synthesize into speech.",
                ),
                IO.Combo.Input(
                    "voice",
                    options=MINIMAX_TTS_VOICES,
                    default="English_Graceful_Lady",
                    tooltip="The voice to use for speech synthesis.",
                ),
                IO.Combo.Input(
                    "model",
                    options=MINIMAX_TTS_MODELS,
                    default="speech-2.8-hd",
                    tooltip="TTS model to use. speech-2.8-hd is higher quality; speech-2.8-turbo is faster.",
                ),
                IO.Float.Input(
                    "speed",
                    default=1.0,
                    min=0.5,
                    max=2.0,
                    step=0.1,
                    display_mode=IO.NumberDisplay.slider,
                    optional=True,
                    tooltip="Speech speed. 1.0 is normal speed.",
                    advanced=True,
                ),
            ],
            outputs=[
                IO.Audio.Output(),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            # Pricing is per-character, so only a static text badge can be shown.
            price_badge=IO.PriceBadge(
                expr="""{"type":"text","text":"Character-based"}""",
            ),
        )

    @classmethod
    async def execute(
        cls,
        text: str,
        voice: str = "English_Graceful_Lady",
        model: str = "speech-2.8-hd",
        speed: float = 1.0,
    ) -> IO.NodeOutput:
        """Synthesize ``text`` via the MiniMax T2A v2 endpoint and return the
        decoded audio.

        The response is requested as raw binary (MP3, mono, 32 kHz, 128 kbps
        per the audio_setting below) and converted with
        audio_bytes_to_audio_input.
        """
        # Fix: pass field_name so a validation failure names the offending
        # input, consistent with the other validate_string calls in this file.
        validate_string(text, min_length=1, field_name="text")
        response_bytes = await sync_op_raw(
            cls,
            ApiEndpoint(path="/proxy/minimax/v1/t2a_v2", method="POST"),
            data=MinimaxTTSRequest(
                model=model,
                text=text,
                stream=False,
                voice_setting=MinimaxTTSVoiceSetting(
                    voice_id=voice,
                    speed=speed,
                    vol=1.0,   # full volume
                    pitch=0,   # no pitch shift
                ),
                audio_setting=MinimaxTTSAudioSetting(
                    sample_rate=32000,
                    bitrate=128000,
                    format="mp3",
                    channel=1,
                ),
            ),
            as_binary=True,
        )
        return IO.NodeOutput(audio_bytes_to_audio_input(response_bytes))
class MinimaxExtension(ComfyExtension):
    """Extension that registers all MiniMax API nodes with ComfyUI."""

    @override
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
        # MinimaxSubjectToVideoNode is deliberately left out: per the TODO in
        # _generate_mm_video, the API returns invalid params for the S2V-01
        # subject_reference payload.
        return [
            MinimaxTextToVideoNode,
            MinimaxImageToVideoNode,
            # MinimaxSubjectToVideoNode,
            MinimaxHailuoVideoNode,
            MinimaxChatNode,
            MinimaxTextToSpeechNode,
        ]
async def comfy_entrypoint() -> MinimaxExtension:
    """Module entry point called by ComfyUI to obtain this extension instance."""
    return MinimaxExtension()