feat: Ideogram structured-caption nodes

2026-07-14 10:27:17 +08:00 · 2026-06-15 14:32:21 -04:00 · 2026-06-15 14:32:21 -04:00 · 850c91389f
commit 850c91389f
parent 7d4194d984
6 changed files with 444 additions and 0 deletions
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
@ -891,6 +891,14 @@ class Tracks(ComfyTypeIO):
        track_visibility: torch.Tensor
    Type = TrackDict

+@comfytype(io_type="COMFY_DICT")
+class ComfyDict(ComfyTypeIO):
+    """IO type registered as "COMFY_DICT"; its Python value type is a plain ``dict``."""
+    Type = dict
+
+@comfytype(io_type="COMFY_LIST")
+class ComfyList(ComfyTypeIO):
+    """IO type registered as "COMFY_LIST"; its Python value type is a plain ``list``."""
+    Type = list
+
@comfytype(io_type="COMFY_MULTITYPED_V3")
 class MultiType:
    Type = Any
@ -1326,6 +1334,32 @@ class Curve(ComfyTypeIO):
            return d


+@comfytype(io_type="COLORS")
+class Colors(ComfyTypeIO):
+    """IO type registered as "COLORS": a list of ``Color.Type`` values."""
+    Type = list[Color.Type]
+
+    class Input(WidgetInput):
+        """Widget input carrying a list of colors; ``socketless`` defaults to True."""
+        def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
+                     socketless: bool=True, default: list[str]=None, advanced: bool=None):
+            # NOTE(review): arguments are passed positionally; the order must match
+            # WidgetInput.__init__, which is not visible in this hunk — confirm.
+            super().__init__(id, display_name, optional, tooltip, None, default, socketless, None, None, None, None, advanced)
+            # A missing default becomes a fresh empty list created per instance
+            # (rather than a mutable default argument shared between inputs).
+            if default is None:
+                self.default = []
+
+
+@comfytype(io_type="BOUNDING_BOXES")
+class BoundingBoxes(ComfyTypeIO):
+    """IO type registered as "BOUNDING_BOXES": a list of bounding boxes with metadata."""
+    class BoundingBoxWithMetadata(BoundingBox.BoundingBoxDict):
+        # Extends the single-box dict with a free-form ``metadata`` mapping.
+        metadata: dict
+    Type = list[BoundingBoxWithMetadata]
+
+    class Input(WidgetInput):
+        """Widget input carrying a list of box dicts; ``socketless`` defaults to True."""
+        def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
+                     socketless: bool=True, default: list[dict]=None, advanced: bool=None):
+            # NOTE(review): arguments are passed positionally; the order must match
+            # WidgetInput.__init__, which is not visible in this hunk — confirm.
+            super().__init__(id, display_name, optional, tooltip, None, default, socketless, None, None, None, None, advanced)
+            # A missing default becomes a fresh empty list created per instance
+            # (rather than a mutable default argument shared between inputs).
+            if default is None:
+                self.default = []
+
+
@comfytype(io_type="HISTOGRAM")
 class Histogram(ComfyTypeIO):
    """A histogram represented as a list of bin counts."""
@ -2376,6 +2410,8 @@ __all__ = [
    "AnyType",
    "MultiType",
    "Tracks",
+    "ComfyDict",
+    "ComfyList",
    "Color",
    # Dynamic Types
    "MatchType",
@ -2394,6 +2430,8 @@ __all__ = [
    "PriceBadgeDepends",
    "PriceBadge",
    "BoundingBox",
+    "BoundingBoxes",
+    "Colors",
    "Curve",
    "Histogram",
    "Range",
--- a/comfy_extras/color_util.py
+++ b/comfy_extras/color_util.py
@ -0,0 +1,23 @@
+def hex_to_rgb(value: str) -> tuple[int, int, int]:
+    """Parse a ``#RRGGBB`` (or bare ``RRGGBB``) hex color into an ``(r, g, b)`` tuple.
+
+    Leading ``#`` characters are stripped. Anything that is not exactly six
+    hexadecimal digits afterwards, including non-string input, falls back to
+    white ``(255, 255, 255)`` instead of raising.
+    """
+    if not isinstance(value, str):
+        return (255, 255, 255)
+    h = value.lstrip("#")
+    # int(..., 16) also accepts signs and surrounding whitespace ("-1", " f"),
+    # which would yield negative or bogus channels, so validate the digits first.
+    if len(h) != 6 or any(c not in "0123456789abcdefABCDEF" for c in h):
+        return (255, 255, 255)
+    return (int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16))
+
+
+def readable_color(rgb: tuple[int, int, int]) -> tuple[int, int, int]:
+    """Return ``rgb`` unchanged if its luma is at least 130, else blend it toward white.
+
+    Luma uses the 0.299 / 0.587 / 0.114 channel weights. Darker colors are mixed
+    with white by the fraction that brings the weighted luma up to 130.
+    """
+    red, green, blue = rgb
+    luma = 0.299 * red + 0.587 * green + 0.114 * blue
+    if luma < 130:
+        # Fraction of the way from the color to white at which luma hits 130.
+        blend = (130 - luma) / (255 - luma)
+        return tuple(round(ch + (255 - ch) * blend) for ch in (red, green, blue))
+    return (red, green, blue)
+
+
+def normalize_palette(colors) -> list[str]:
+    """Return the non-empty string entries of ``colors``, upper-cased.
+
+    Args:
+        colors: an iterable of color strings, a dict (its values are used),
+            a single color string, or ``None``.
+
+    Returns:
+        A list of upper-cased strings; non-string and empty entries are dropped.
+        ``None`` yields an empty list.
+    """
+    if colors is None:
+        return []
+    if isinstance(colors, str):
+        # A bare string would otherwise be iterated character by character.
+        colors = [colors]
+    elif isinstance(colors, dict):
+        colors = colors.values()
+    return [c.upper() for c in colors if isinstance(c, str) and c]
--- a/comfy_extras/nodes_bounding_boxes.py
+++ b/comfy_extras/nodes_bounding_boxes.py
@ -0,0 +1,252 @@
+import numpy as np
+import torch
+from PIL import Image, ImageDraw, ImageEnhance, ImageFont
+from typing_extensions import override
+
+from comfy_api.latest import ComfyExtension, io
+from comfy_extras.color_util import hex_to_rgb, normalize_palette, readable_color
+
+# Upper bound (in pixels) for the longer side of the rendered preview; larger
+# canvases/backgrounds are scaled down to this, smaller ones are not upscaled.
+_PREVIEW_LONG_EDGE = 1024
+# Dimming factor: brightness multiplier for the background image, and the
+# scale applied to mid-grey (128) for the plain canvas.
+_PREVIEW_DIM = 0.25
+
+
+def pixels_to_fractions(box: dict, width: int, height: int) -> dict:
+    """Convert a pixel box (``x``/``y``/``width``/``height``) to fractional ``x``/``y``/``w``/``h``.
+
+    Missing box keys count as 0. A falsy ``width`` or ``height`` is replaced by 1
+    so the division cannot fail on a zero-sized canvas.
+    """
+    span_x = width if width else 1
+    span_y = height if height else 1
+    px_x = box.get("x", 0)
+    px_y = box.get("y", 0)
+    px_w = box.get("width", 0)
+    px_h = box.get("height", 0)
+    return dict(x=px_x / span_x, y=px_y / span_y, w=px_w / span_x, h=px_h / span_y)
+
+
+def fractions_to_pixels(box: dict, width: int, height: int) -> dict:
+    """Convert a fractional box (``x``/``y``/``w``/``h``) to rounded pixel ``x``/``y``/``width``/``height``.
+
+    Missing keys count as 0.0. A negative extent is flipped so the returned
+    origin is the top-left corner and the size is non-negative.
+    """
+    left = box.get("x", 0.0)
+    top = box.get("y", 0.0)
+    span_w = box.get("w", 0.0)
+    span_h = box.get("h", 0.0)
+    if span_w < 0:
+        # Box was dragged right-to-left: move the origin and make the width positive.
+        left = left + span_w
+        span_w = -span_w
+    if span_h < 0:
+        # Box was dragged bottom-to-top: same treatment on the vertical axis.
+        top = top + span_h
+        span_h = -span_h
+    scaled = (left * width, top * height, span_w * width, span_h * height)
+    return dict(zip(("x", "y", "width", "height"), map(round, scaled)))
+
+
+def fractions_to_bbox_frame(boxes: list, width: int, height: int) -> list:
+    """Convert fractional boxes to pixel boxes wrapped in a single-frame list.
+
+    Non-dict entries are ignored. Returns ``[[box, ...]]`` when at least one box
+    was converted, otherwise an empty list.
+    """
+    frame = []
+    for entry in boxes:
+        if isinstance(entry, dict):
+            frame.append(fractions_to_pixels(entry, width, height))
+    if not frame:
+        return []
+    return [frame]
+
+
+def _font(size: int):
+    """Return PIL's default font at ``size``, or the unsized default if that fails."""
+    try:
+        chosen = ImageFont.load_default(size)
+    except Exception:
+        # Presumably covers Pillow builds whose load_default() cannot honour a
+        # size argument — verify against the supported Pillow versions.
+        chosen = ImageFont.load_default()
+    return chosen
+
+
+def _wrap(draw, text: str, font, max_w: float) -> list[str]:
+    """Greedily word-wrap ``text`` so each line measures at most ``max_w``.
+
+    Explicit newlines start a new paragraph (an empty paragraph yields an empty
+    line). Widths come from ``draw.textlength``. A single word wider than
+    ``max_w`` is kept whole on its own line.
+    """
+    wrapped = []
+    for paragraph in text.split("\n"):
+        current = ""
+        for token in paragraph.split():
+            if not current:
+                # First word of a line is always accepted, even if too wide.
+                current = token
+                continue
+            candidate = current + " " + token
+            if draw.textlength(candidate, font=font) > max_w:
+                wrapped.append(current)
+                current = token
+            else:
+                current = candidate
+        wrapped.append(current)
+    return wrapped
+
+
+def _bg_from_image(image) -> Image.Image | None:
+    """Convert the first entry of ``image`` into a PIL image for the preview background.
+
+    Values are scaled by 255 and clipped to 0..255 before conversion to uint8
+    (assumes a float tensor batch in the 0..1 range — TODO confirm with callers).
+
+    Returns:
+        The PIL image, or ``None`` when ``image`` is ``None`` or cannot be
+        converted. Conversion is best effort: a failure is logged and the
+        preview falls back to having no background.
+    """
+    if image is None:
+        return None
+    try:
+        arr = (image[0].detach().cpu().numpy() * 255).clip(0, 255).astype(np.uint8)
+        return Image.fromarray(arr)
+    except Exception:
+        # Keep the node working without a background, but leave a trace so a
+        # silently missing background can be diagnosed.
+        import logging
+        logging.warning("Could not convert background image for bounding box preview; rendering without it.", exc_info=True)
+        return None
+
+
+def render_preview(regions, width, height, bg=None):
+    """Render regions as numbered, colored boxes over a dimmed canvas.
+
+    Args:
+        regions: iterable of region dicts with fractional ``x``/``y``/``w``/``h`` and
+            optional ``type``, ``text``, ``desc`` and ``palette``. Non-dict entries
+            are skipped.
+        width: canvas width; only used for the aspect ratio when ``bg`` is None.
+        height: canvas height; only used for the aspect ratio when ``bg`` is None.
+        bg: optional PIL image drawn dimmed behind the boxes. When given, its own
+            size determines the preview size.
+
+    Returns:
+        A float32 tensor of shape ``(1, H, W, 3)`` with values in 0..1.
+    """
+    if bg is not None:
+        iw, ih = bg.size
+        long_edge = max(iw, ih) or 1
+        # Only ever scale down: the long edge is capped, never enlarged.
+        scale = min(1.0, _PREVIEW_LONG_EDGE / long_edge)
+        rw, rh = max(1, round(iw * scale)), max(1, round(ih * scale))
+        base = bg.convert("RGB").resize((rw, rh), Image.LANCZOS)
+        base = ImageEnhance.Brightness(base).enhance(_PREVIEW_DIM)
+        img = base.convert("RGBA")
+    else:
+        long_edge = max(width, height) or 1
+        scale = min(1.0, _PREVIEW_LONG_EDGE / long_edge)
+        rw, rh = max(1, round(width * scale)), max(1, round(height * scale))
+        grey = round(_PREVIEW_DIM * 128)
+        img = Image.new("RGBA", (rw, rh), (grey, grey, grey, 255))
+
+    # Everything is drawn on a transparent overlay and composited at the end.
+    overlay = Image.new("RGBA", (rw, rh), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(overlay)
+    fs = max(10, round(rh / 64))
+    font = _font(fs)
+    tag_font = _font(max(9, fs - 2))
+    line_h = fs + 2
+
+    for i, region in enumerate(regions):
+        if not isinstance(region, dict):
+            continue
+        # Same normalization as the emitted caption elements: dict palettes use
+        # their values and non-string entries are dropped instead of crashing.
+        palette = normalize_palette(region.get("palette") or [])
+        r, g, b = hex_to_rgb(palette[0]) if palette else (140, 140, 140)
+        # Body text color is the same for every wrapped line of this region.
+        text_fill = readable_color((r, g, b)) + (255,)
+        x1 = max(0, min(rw, round(region.get("x", 0) * rw)))
+        y1 = max(0, min(rh, round(region.get("y", 0) * rh)))
+        x2 = max(0, min(rw, round((region.get("x", 0) + region.get("w", 0)) * rw)))
+        y2 = max(0, min(rh, round((region.get("y", 0) + region.get("h", 0)) * rh)))
+        # Negative extents produce swapped corners; put them back in order.
+        if x2 < x1:
+            x1, x2 = x2, x1
+        if y2 < y1:
+            y1, y2 = y2, y1
+
+        draw.rectangle([x1, y1, x2, y2], outline=(r, g, b, 255), width=2)
+
+        # Up to five palette swatches share the top edge of the box equally.
+        swatches = palette[:5]
+        if swatches and (x2 - x1) > 2:
+            sh = max(5, fs // 2)
+            seg = (x2 - x1) / len(swatches)
+            for p, hexc in enumerate(swatches):
+                sx = x1 + round(p * seg)
+                draw.rectangle([sx, y1, x1 + round((p + 1) * seg), y1 + sh], fill=hex_to_rgb(hexc))
+
+        etype = "text" if region.get("type") == "text" else "obj"
+        # Two-digit, 1-based tag; the index also counts skipped non-dict entries.
+        tag = str(i + 1).zfill(2)
+        tw = draw.textlength(tag, font=tag_font)
+        draw.rectangle([x1, y1, x1 + tw + 6, y1 + fs + 2], fill=(r, g, b, 255))
+        # Black tag text on light fills, white on dark ones.
+        tag_fill = (0, 0, 0, 255) if (0.299 * r + 0.587 * g + 0.114 * b) > 140 else (255, 255, 255, 255)
+        draw.text((x1 + 3, y1 + 1), tag, fill=tag_fill, font=tag_font)
+
+        body = region.get("desc", "") or ""
+        if etype == "text" and region.get("text"):
+            body = '"%s"%s' % (region["text"], " — " + body if body else "")
+        if body and (x2 - x1) > 8:
+            ty = y1 + fs + 5
+            for line in _wrap(draw, body, font, x2 - x1 - 8):
+                # Stop once the next line would start below the box.
+                if ty > y2:
+                    break
+                draw.text((x1 + 4, ty), line, fill=text_fill, font=font)
+                ty += line_h
+
+    composed = Image.alpha_composite(img, overlay).convert("RGB")
+    arr = np.asarray(composed, dtype=np.float32) / 255.0
+    return torch.from_numpy(arr).unsqueeze(0)
+
+
+def boxes_to_regions(boxes, width: int, height: int) -> list:
+    """Turn editor boxes (pixel coordinates plus ``metadata``) into fractional region dicts.
+
+    Anything that is not a list yields an empty list, and non-dict entries are
+    skipped. Each region holds fractional ``x``/``y``/``w``/``h`` followed by
+    ``type``, ``text``, ``desc`` and ``palette`` taken from the box metadata.
+    """
+    if not isinstance(boxes, list):
+        return []
+    converted: list = []
+    for entry in boxes:
+        if not isinstance(entry, dict):
+            continue
+        raw_meta = entry.get("metadata")
+        # Missing or malformed metadata behaves like an empty mapping.
+        details = raw_meta if isinstance(raw_meta, dict) else {}
+        region = dict(pixels_to_fractions(entry, width, height))
+        region.update({
+            "type": details.get("type", "obj"),
+            "text": details.get("text", ""),
+            "desc": details.get("desc", ""),
+            "palette": details.get("palette", []),
+        })
+        converted.append(region)
+    return converted
+
+
+def _norm_bbox(region: dict) -> list[int]:
+    """Map a fractional region to ``[ymin, xmin, ymax, xmax]`` on a 0..1000 integer grid.
+
+    Each edge is rounded and clamped to 0..1000. Min and max are ordered so a
+    negative extent still produces a valid box.
+    """
+    def to_grid(fraction: float) -> int:
+        cell = round(fraction * 1000)
+        if cell < 0:
+            return 0
+        if cell > 1000:
+            return 1000
+        return cell
+
+    left = region.get("x", 0.0)
+    top = region.get("y", 0.0)
+    wide = region.get("w", 0.0)
+    tall = region.get("h", 0.0)
+    y_lo, y_hi = sorted((to_grid(top), to_grid(top + tall)))
+    x_lo, x_hi = sorted((to_grid(left), to_grid(left + wide)))
+    return [y_lo, x_lo, y_hi, x_hi]
+
+
+def build_elements(regions: list) -> list:
+    """Build the caption element dicts for ``regions``.
+
+    Non-dict regions are skipped. Each element has ``type`` ("text" or "obj"),
+    ``bbox`` (from ``_norm_bbox``), ``text`` (text elements only), ``desc`` and,
+    when the region has usable colors, ``color_palette`` capped at five entries.
+    ``None`` values for ``text``, ``desc`` or ``palette`` are treated as empty,
+    matching how the preview renderer reads the same fields.
+    """
+    elements = []
+    for region in regions:
+        if not isinstance(region, dict):
+            continue
+        etype = "text" if region.get("type") == "text" else "obj"
+        element = {"type": etype}
+        element["bbox"] = _norm_bbox(region)
+        if etype == "text":
+            element["text"] = region.get("text") or ""
+        element["desc"] = region.get("desc") or ""
+        # `.get(key, [])` would still hand an explicit None to normalize_palette.
+        palette = normalize_palette(region.get("palette") or [])
+        if palette:
+            element["color_palette"] = palette[:5]
+        elements.append(element)
+    return elements
+
+
+class CreateBoundingBoxes(io.ComfyNode):
+    """Node that turns the bounding-box editor state into a preview, pixel boxes and caption elements."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node's inputs (optional background, canvas size, editor state) and its three outputs."""
+        editor_state = io.BoundingBoxes.Input(
+            "editor_state",
+            tooltip="Draw regions and set each region's type/text/desc/palette.",
+        )
+        return io.Schema(
+            node_id="CreateBoundingBoxes",
+            display_name="Create Bounding Boxes",
+            category="utilities",
+            description="Draw regions over a reference image. Outputs Ideogram caption elements, pixel-space bounding boxes, and a rendered preview.",
+            inputs=[
+                io.Image.Input(
+                    "background",
+                    optional=True,
+                    tooltip="Optional reference image shown behind the canvas and preview.",
+                ),
+                io.Int.Input("width", default=1024, min=64, max=16384, step=16,
+                             tooltip="Canvas aspect width and the pixel grid for the bbox output."),
+                io.Int.Input("height", default=1024, min=64, max=16384, step=16,
+                             tooltip="Canvas aspect height and the pixel grid for the bbox output."),
+                editor_state,
+            ],
+            # Output order must match the positional values returned by execute().
+            outputs=[
+                io.Image.Output(display_name="preview"),
+                io.BoundingBox.Output(display_name="bboxes"),
+                io.ComfyList.Output(display_name="elements"),
+            ],
+            is_experimental=True,
+        )
+
+    @classmethod
+    def execute(cls, width, height, editor_state=None, background=None) -> io.NodeOutput:
+        """Convert the editor boxes to fractional regions and derive all three outputs from them."""
+        # Editor boxes are in pixels; regions hold fractions of width/height.
+        regions = boxes_to_regions(editor_state, width, height)
+        preview = render_preview(regions, width, height, _bg_from_image(background))
+        return io.NodeOutput(
+            preview,
+            fractions_to_bbox_frame(regions, width, height),
+            build_elements(regions),
+            # NOTE(review): presumably consumed by the frontend editor widget — confirm.
+            ui={"dims": [width, height]},
+        )
+
+
+class BoundingBoxesExtension(ComfyExtension):
+    """Extension that registers the bounding-box node defined in this module."""
+
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        return [CreateBoundingBoxes]
+
+
+async def comfy_entrypoint() -> BoundingBoxesExtension:
+    """Return a new ``BoundingBoxesExtension`` instance."""
+    return BoundingBoxesExtension()
--- a/comfy_extras/nodes_json_prompt.py
+++ b/comfy_extras/nodes_json_prompt.py
@ -0,0 +1,76 @@
+from typing_extensions import override
+
+from comfy_api.latest import ComfyExtension, io
+from comfy_extras.color_util import normalize_palette
+
+
+class BuildJsonPromptIdeogram(io.ComfyNode):
+    """Node that assembles a structured caption dict from elements, background and style fields."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node's inputs (elements, descriptions, style selector, palette) and its dict output."""
+        color_palette = io.Colors.Input(
+            "color_palette",
+            tooltip="Style color palette.",
+        )
+        return io.Schema(
+            node_id="BuildJsonPromptIdeogram",
+            display_name="Build JSON Prompt (Ideogram)",
+            category="image/ideogram",
+            description="Assemble the Ideogram 4 caption from Create Bounding Boxes elements plus the background and style fields.",
+            inputs=[
+                io.ComfyList.Input("element", tooltip="Caption elements from Create Bounding Boxes."),
+                io.String.Input("high_level_description", multiline=True, default="",
+                                tooltip="Optional one-line overview of the whole image (blank = omitted)."),
+                io.String.Input("background", multiline=True, default="",
+                                tooltip="Scene background description."),
+                # Each style option exposes its own extra string input ("photo" / "art_style").
+                io.DynamicCombo.Input("style", options=[
+                    io.DynamicCombo.Option("none", []),
+                    io.DynamicCombo.Option("photo", [io.String.Input("photo", default="")]),
+                    io.DynamicCombo.Option("art_style", [io.String.Input("art_style", default="")]),
+                ]),
+                io.String.Input("aesthetics", default="", tooltip="Style descriptor. Sent even when blank once a style is chosen."),
+                io.String.Input("lighting", default="", tooltip="Style descriptor. Sent even when blank once a style is chosen."),
+                io.String.Input("medium", default="", tooltip="Style descriptor. Sent even when blank once a style is chosen."),
+                color_palette,
+            ],
+            outputs=[io.ComfyDict.Output(display_name="prompt")],
+            is_experimental=True,
+        )
+
+    @classmethod
+    def execute(cls, element, style, high_level_description="", background="",
+                aesthetics="", lighting="", medium="", color_palette=None) -> io.NodeOutput:
+        """Build the caption dict.
+
+        ``high_level_description`` is included only when non-blank.
+        ``style_description`` is included only when a style other than "none" is
+        selected; its ``color_palette`` only when the palette is non-empty.
+        ``compositional_deconstruction`` is always present.
+        """
+        # A non-list element input is treated as "no elements".
+        elements = element if isinstance(element, list) else []
+        # NOTE(review): assumes the combo value arrives as a dict holding the selected
+        # option under "style" plus that option's own input — confirm against DynamicCombo.
+        kind = style.get("style", "none") if isinstance(style, dict) else "none"
+        photo = style.get("photo", "") if isinstance(style, dict) else ""
+        art_style = style.get("art_style", "") if isinstance(style, dict) else ""
+        palette = normalize_palette(color_palette or [])
+
+        caption: dict = {}
+        if high_level_description.strip():
+            caption["high_level_description"] = high_level_description
+        if kind != "none":
+            style_desc: dict = {"aesthetics": aesthetics, "lighting": lighting}
+            # Key insertion order differs per style kind and is preserved in the output dict.
+            if kind == "photo":
+                style_desc["photo"] = photo
+                style_desc["medium"] = medium
+            else:
+                style_desc["medium"] = medium
+                style_desc["art_style"] = art_style
+            if palette:
+                style_desc["color_palette"] = palette
+            caption["style_description"] = style_desc
+        caption["compositional_deconstruction"] = {
+            "background": background,
+            "elements": elements,
+        }
+        return io.NodeOutput(caption)
+
+
+class JsonPromptExtension(ComfyExtension):
+    """Extension that registers the JSON prompt builder node defined in this module."""
+
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        return [BuildJsonPromptIdeogram]
+
+
+async def comfy_entrypoint() -> JsonPromptExtension:
+    """Return a new ``JsonPromptExtension`` instance."""
+    return JsonPromptExtension()
--- a/comfy_extras/nodes_string.py
+++ b/comfy_extras/nodes_string.py
@ -440,6 +440,57 @@ class JsonExtractString(io.ComfyNode):
        except (json.JSONDecodeError, TypeError):
            return io.NodeOutput("")

+
+def _dump_json(value, indent):
+    """Serialize ``value`` to JSON text, keeping non-ASCII characters unescaped.
+
+    A falsy ``indent`` (0 or ``None``) produces single-line output; any other
+    value is the number of spaces per nesting level.
+    """
+    spacing = indent if indent else None
+    return json.dumps(value, indent=spacing, ensure_ascii=False)
+
+
+class DictToJsonString(io.ComfyNode):
+    """Node that serializes a ``ComfyDict`` value to a JSON string."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the dict input, the indent setting and the string output."""
+        return io.Schema(
+            node_id="DictToJsonString",
+            display_name="Dict to JSON String",
+            category="text",
+            search_aliases=["json", "dict to json", "stringify", "serialize", "dict to string"],
+            inputs=[
+                io.ComfyDict.Input("value"),
+                io.Int.Input("indent", default=2, min=0, max=8,
+                             tooltip="Spaces per indent level. 0 produces compact single-line JSON."),
+            ],
+            outputs=[
+                io.String.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, value, indent=2):
+        """Return ``value`` serialized by ``_dump_json`` with the requested indent."""
+        return io.NodeOutput(_dump_json(value, indent))
+
+
+class ListToJsonString(io.ComfyNode):
+    """Node that serializes a ``ComfyList`` value to a JSON string."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the list input, the indent setting and the string output."""
+        return io.Schema(
+            node_id="ListToJsonString",
+            display_name="List to JSON String",
+            category="text",
+            search_aliases=["json", "list to json", "stringify", "serialize", "list to string", "array to json"],
+            inputs=[
+                io.ComfyList.Input("value"),
+                io.Int.Input("indent", default=2, min=0, max=8,
+                             tooltip="Spaces per indent level. 0 produces compact single-line JSON."),
+            ],
+            outputs=[
+                io.String.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, value, indent=2):
+        """Return ``value`` serialized by ``_dump_json`` with the requested indent."""
+        return io.NodeOutput(_dump_json(value, indent))
+
+
 class StringExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
@ -457,6 +508,8 @@ class StringExtension(ComfyExtension):
            RegexExtract,
            RegexReplace,
            JsonExtractString,
+            DictToJsonString,
+            ListToJsonString,
        ]

 async def comfy_entrypoint() -> StringExtension:
--- a/nodes.py
+++ b/nodes.py
@ -2363,6 +2363,8 @@ async def init_builtin_extra_nodes():
        "nodes_images.py",
        "nodes_video_model.py",
        "nodes_ideogram4.py",
+        "nodes_bounding_boxes.py",
+        "nodes_json_prompt.py",
        "nodes_train.py",
        "nodes_dataset.py",
        "nodes_sag.py",