# ComfyUI/comfy_api_nodes/nodes_bytedance_llm.py

"""API Nodes for ByteDance Seed LLM via the BytePlus ModelArk Responses API.

See: https://docs.byteplus.com/en/docs/ModelArk/1585128
"""

from typing_extensions import override

from comfy_api.latest import IO, ComfyExtension, Input
from comfy_api_nodes.apis.bytedance_llm import (
    BytePlusInputImage,
    BytePlusInputMessage,
    BytePlusInputText,
    BytePlusInputVideo,
    BytePlusMessageContent,
    BytePlusResponseCreateRequest,
    BytePlusResponseObject,
)
from comfy_api_nodes.util import (
    ApiEndpoint,
    get_number_of_images,
    sync_op,
    upload_images_to_comfyapi,
    upload_video_to_comfyapi,
    validate_string,
)

# Path that ByteDanceSeedNode.execute POSTs every request to.
BYTEPLUS_RESPONSES_ENDPOINT = "/proxy/byteplus/api/v3/responses"
# Per-request attachment caps: they size the autogrow image/video slots built by
# _seed_model_inputs and are enforced again in ByteDanceSeedNode.execute.
SEED_MAX_IMAGES = 20
SEED_MAX_VIDEOS = 4

# Label shown in the node's model combo -> model id sent in the request body.
SEED_MODELS: dict[str, str] = {
    "Seed 2.0 Pro": "seed-2-0-pro-260328",
    "Seed 2.0 Lite": "seed-2-0-lite-260228",
    "Seed 2.0 Mini": "seed-2-0-mini-260215",
}

# USD per 1M tokens: (input, cache_hit_input, output)
# Keyed by model id (the values of SEED_MODELS); read by _calculate_price.
# NOTE(review): the price badge expression in ByteDanceSeedNode.define_schema
# hard-codes its own per-1K-token figures, and its "mini" and "lite" ranges do
# not equal these rates divided by 1000 - confirm which source is authoritative.
_SEED_PRICES_PER_MILLION: dict[str, tuple[float, float, float]] = {
    "seed-2-0-pro-260328": (0.50, 0.10, 3.00),
    "seed-2-0-lite-260228": (0.25, 0.05, 2.00),
    "seed-2-0-mini-260215": (0.10, 0.02, 0.40),
}


def _seed_model_inputs(max_images: int = SEED_MAX_IMAGES, max_videos: int = SEED_MAX_VIDEOS):
    """Build the sub-inputs attached to a model option of the Seed node.

    Args:
        max_images: Number of image slots to expose (named ``image_1`` .. ``image_N``).
        max_videos: Number of video slots to expose (named ``video_1`` .. ``video_N``).

    Returns:
        A three-element list: the ``images`` autogrow input, the ``videos``
        autogrow input, and the advanced ``temperature`` float input.
    """
    image_slot_names = [f"image_{slot}" for slot in range(1, max_images + 1)]
    video_slot_names = [f"video_{slot}" for slot in range(1, max_videos + 1)]

    # min=0 on both templates: the node can run with no attachments at all.
    images_input = IO.Autogrow.Input(
        "images",
        template=IO.Autogrow.TemplateNames(
            IO.Image.Input("image"),
            names=image_slot_names,
            min=0,
        ),
        tooltip=f"Optional image(s) to use as context for the model. Up to {max_images} images.",
    )
    videos_input = IO.Autogrow.Input(
        "videos",
        template=IO.Autogrow.TemplateNames(
            IO.Video.Input("video"),
            names=video_slot_names,
            min=0,
        ),
        tooltip=f"Optional video(s) to use as context for the model. Up to {max_videos} videos.",
    )
    temperature_input = IO.Float.Input(
        "temperature",
        default=1.0,
        min=0.0,
        max=2.0,
        step=0.01,
        tooltip="Controls randomness. 0.0 is deterministic, higher values are more random.",
        advanced=True,
    )
    return [images_input, videos_input, temperature_input]


def _calculate_price(model_id: str, response: BytePlusResponseObject) -> float | None:
    """Estimate the USD cost of a response from its token usage.

    Args:
        model_id: Model id used to look up rates in ``_SEED_PRICES_PER_MILLION``.
        response: Parsed API response whose ``usage`` block is read.

    Returns:
        The approximate price in USD, or ``None`` when the response carries no
        usage data or the model id has no entry in the rate table.
    """
    usage = response.usage
    rates = _SEED_PRICES_PER_MILLION.get(model_id)
    if not usage or rates is None:
        return None

    prompt_rate, cached_rate, completion_rate = rates

    # Missing counters are treated as zero.
    prompt_tokens = usage.input_tokens or 0
    completion_tokens = usage.output_tokens or 0
    details = usage.input_tokens_details
    cached_tokens = (details.cached_tokens or 0) if details else 0

    # The subtraction treats cached tokens as already counted inside
    # input_tokens; only the remainder is billed at the full input rate.
    uncached_tokens = max(0, prompt_tokens - cached_tokens)

    # Rates are per one million tokens, so scale down once at the end.
    cost_per_million_units = (
        uncached_tokens * prompt_rate + cached_tokens * cached_rate + completion_tokens * completion_rate
    )
    return cost_per_million_units / 1_000_000.0


def _get_text_from_response(response: BytePlusResponseObject) -> str:
    """Collect the text of every ``output_text`` block found in message outputs.

    Args:
        response: Parsed API response whose ``output`` items are scanned.

    Returns:
        The non-empty text blocks joined with newlines, or ``""`` when the
        response has no output or no text blocks.

    Raises:
        ValueError: As soon as a non-empty ``refusal`` block is encountered.
    """
    texts: list[str] = []
    for output_item in response.output or ():
        # Only "message" items that actually carry content are inspected.
        if output_item.type == "message" and output_item.content:
            for part in output_item.content:
                if part.type == "output_text":
                    if part.text:
                        texts.append(part.text)
                elif part.type == "refusal" and part.refusal:
                    raise ValueError(f"Model refused to respond: {part.refusal}")
    return "\n".join(texts)


async def _build_image_content_blocks(
    cls: type[IO.ComfyNode],
    image_tensors: list[Input.Image],
) -> list[BytePlusInputImage]:
    """Upload the given images and wrap each returned URL in an image content block.

    Args:
        cls: Node class passed through to the upload helper.
        image_tensors: Images to upload; the helper is called with
            ``max_images=SEED_MAX_IMAGES``.

    Returns:
        One ``BytePlusInputImage`` per URL returned by the upload, in order.
    """
    uploaded_urls = await upload_images_to_comfyapi(
        cls,
        image_tensors,
        max_images=SEED_MAX_IMAGES,
        wait_label="Uploading reference images",
    )
    image_blocks: list[BytePlusInputImage] = []
    for uploaded_url in uploaded_urls:
        image_blocks.append(BytePlusInputImage(image_url=uploaded_url))
    return image_blocks


async def _build_video_content_blocks(
    cls: type[IO.ComfyNode],
    videos: list[Input.Video],
) -> list[BytePlusInputVideo]:
    """Upload each video one after another and wrap its URL in a video content block.

    Args:
        cls: Node class passed through to the upload helper.
        videos: Videos to upload, processed in the given order.

    Returns:
        One ``BytePlusInputVideo`` per input video, in the same order.
    """
    base_label = "Uploading reference video"
    count = len(videos)
    video_blocks: list[BytePlusInputVideo] = []
    for position, clip in enumerate(videos, start=1):
        # A "(k/n)" progress suffix is only shown when several videos are uploaded.
        progress_label = f"{base_label} ({position}/{count})" if count > 1 else base_label
        uploaded_url = await upload_video_to_comfyapi(cls, clip, wait_label=progress_label)
        video_blocks.append(BytePlusInputVideo(video_url=uploaded_url))
    return video_blocks


class ByteDanceSeedNode(IO.ComfyNode):
    """API node that sends a prompt, plus optional images/videos, to a Seed 2.0 model and returns its text."""

    @classmethod
    def define_schema(cls):
        """Declare the node's inputs, string output, hidden fields and price badge."""
        prompt_input = IO.String.Input(
            "prompt",
            multiline=True,
            default="",
            tooltip="Text input to the model.",
        )
        # One combo option per label in SEED_MODELS; each option gets its own
        # freshly built image/video/temperature sub-inputs.
        model_input = IO.DynamicCombo.Input(
            "model",
            options=[
                IO.DynamicCombo.Option(display_name, _seed_model_inputs())
                for display_name in SEED_MODELS
            ],
            tooltip="The Seed model used to generate the response.",
        )
        seed_input = IO.Int.Input(
            "seed",
            default=0,
            min=0,
            max=2147483647,
            control_after_generate=True,
            tooltip="Seed controls whether the node should re-run; "
            "results are non-deterministic regardless of seed.",
        )
        system_prompt_input = IO.String.Input(
            "system_prompt",
            multiline=True,
            default="",
            optional=True,
            advanced=True,
            tooltip="Foundational instructions that dictate the model's behavior.",
        )

        return IO.Schema(
            node_id="ByteDanceSeedNode",
            display_name="ByteDance Seed",
            category="api node/text/ByteDance",
            essentials_category="Text Generation",
            description="Generate text responses with ByteDance's Seed 2.0 models. "
            "Provide a text prompt and optionally one or more images or videos for multimodal context.",
            inputs=[prompt_input, model_input, seed_input, system_prompt_input],
            outputs=[IO.String.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            # The badge picks a per-1K-token USD range by substring match on the
            # selected model widget value, falling back to plain "Token-based".
            # NOTE(review): the "mini" and "lite" ranges below do not equal the
            # _SEED_PRICES_PER_MILLION rates divided by 1000 (only "pro" does) -
            # confirm which figures are authoritative.
            price_badge=IO.PriceBadge(
                depends_on=IO.PriceBadgeDepends(widgets=["model"]),
                expr="""
                (
                  $m := widgets.model;
                  $contains($m, "mini") ? {
                    "type": "list_usd",
                    "usd": [0.00025, 0.0009],
                    "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
                  }
                  : $contains($m, "lite") ? {
                    "type": "list_usd",
                    "usd": [0.0003, 0.002],
                    "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
                  }
                  : $contains($m, "pro") ? {
                    "type": "list_usd",
                    "usd": [0.0005, 0.003],
                    "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
                  }
                  : {"type":"text", "text":"Token-based"}
                )
                """,
            ),
        )

    @classmethod
    async def execute(
        cls,
        prompt: str,
        model: dict,
        seed: int,
        system_prompt: str = "",
    ) -> IO.NodeOutput:
        """Run one request against the selected Seed model and return its text.

        Args:
            prompt: User text; checked with ``validate_string`` (whitespace
                stripped, minimum length 1) before anything else happens.
            model: Value of the dynamic ``model`` combo. Read keys: ``"model"``
                (label from ``SEED_MODELS``), ``"temperature"``, and optional
                ``"images"`` / ``"videos"`` mappings of slot name to value.
            seed: Not used in the request; per the widget tooltip it only
                controls whether the node re-runs.
            system_prompt: Sent as ``instructions``; an empty string is sent as ``None``.

        Returns:
            ``IO.NodeOutput`` wrapping the concatenated response text.

        Raises:
            ValueError: Too many images or videos, the response carries an
                error, the model refused, or the extracted text is empty.
            KeyError: ``model`` lacks a required key or names an unknown label.
        """
        validate_string(prompt, strip_whitespace=True, min_length=1)
        selected_label = model["model"]
        sampling_temperature = model["temperature"]
        model_id = SEED_MODELS[selected_label]

        # Unconnected autogrow slots are dropped; the cap is applied to the
        # total reported by get_number_of_images across all connected inputs.
        images: list[Input.Image] = [img for img in (model.get("images") or {}).values() if img is not None]
        image_count = sum(get_number_of_images(img) for img in images)
        if image_count > SEED_MAX_IMAGES:
            raise ValueError(f"Up to {SEED_MAX_IMAGES} images are supported per request.")

        clips: list[Input.Video] = [clip for clip in (model.get("videos") or {}).values() if clip is not None]
        if len(clips) > SEED_MAX_VIDEOS:
            raise ValueError(f"Up to {SEED_MAX_VIDEOS} videos are supported per request.")

        # Message content order: images, then videos, then the text prompt.
        message_parts: list[BytePlusMessageContent] = []
        if images:
            message_parts += await _build_image_content_blocks(cls, images)
        if clips:
            message_parts += await _build_video_content_blocks(cls, clips)
        message_parts.append(BytePlusInputText(text=prompt))

        endpoint = ApiEndpoint(path=BYTEPLUS_RESPONSES_ENDPOINT, method="POST")
        request_body = BytePlusResponseCreateRequest(
            model=model_id,
            input=[BytePlusInputMessage(role="user", content=message_parts)],
            instructions=system_prompt or None,
            temperature=sampling_temperature,
            store=False,
            stream=False,
        )

        def extract_price(api_response):
            return _calculate_price(model_id, api_response)

        response = await sync_op(
            cls,
            endpoint,
            response_model=BytePlusResponseObject,
            data=request_body,
            price_extractor=extract_price,
        )

        if response.error:
            raise ValueError(f"Seed API error ({response.error.code}): {response.error.message}")

        reply_text = _get_text_from_response(response)
        if not reply_text:
            raise ValueError("Empty response from Seed model.")
        return IO.NodeOutput(reply_text)


class ByteDanceLLMExtension(ComfyExtension):
    """Extension that exposes the ByteDance Seed LLM node."""

    @override
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
        """Return the node classes provided by this extension."""
        node_classes: list[type[IO.ComfyNode]] = [ByteDanceSeedNode]
        return node_classes


async def comfy_entrypoint() -> ByteDanceLLMExtension:
    """Create and return this module's extension instance."""
    extension = ByteDanceLLMExtension()
    return extension