ComfyUI/comfy_extras/nodes_boogu.py

import math

import node_helpers
import comfy.utils
from typing_extensions import override
from comfy_api.latest import ComfyExtension, io


class TextEncodeBooguEdit(io.ComfyNode):
    """Boogu-Image Edit conditioning.

    The edit image is used twice, matching the reference pipeline:
      - Qwen3-VL vision tokens (instruction understanding) -> positive only
      - VAE reference latent (image identity)              -> positive and negative
    The ref latent is in both conds so it cancels under CFG (identity preserved);
    the vision tokens are only in the positive so CFG amplifies the instruction.
    The tokenizer selects the right system prompt automatically (image -> TI2I,
    empty negative -> DROP), so no template plumbing is needed here.
    """

    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="TextEncodeBooguEdit",
            category="model/conditioning/boogu",
            inputs=[
                io.Clip.Input("clip"),
                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
                io.Vae.Input("vae"),
                io.Autogrow.Input(
                    "images",
                    template=io.Autogrow.TemplateNames(
                        io.Image.Input("image"),
                        names=[f"image_{i}" for i in range(1, 17)],
                        min=1,
                    ),
                    tooltip="Reference image(s) to edit. Boogu focuses on one reference per sample; more are allowed.",
                ),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
            ],
        )

    @classmethod
    def execute(cls, clip, prompt, vae=None, images: io.Autogrow.Type = None) -> io.NodeOutput:
        ref_latents = []
        images_vl = []

        images = images or {}
        for name in sorted(images, key=lambda n: int(n.rsplit("_", 1)[-1])):
            image = images[name]
            if image is None:
                continue
            samples = image.movedim(-1, 1)

            # Vision tower input: the reference caps the VLM image at 384x384
            # (max_vlm_input_pil_pixels in pipeline_boogu.py).
            total = int(384 * 384)
            scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
            width = round(samples.shape[3] * scale_by)
            height = round(samples.shape[2] * scale_by)
            s = comfy.utils.common_upscale(samples, width, height, "area", "disabled")
            images_vl.append(s.movedim(1, -1)[:, :, :, :3])

            # Reference latent: align to 16 px (VAE /8 * patch_size 2).
            if vae is not None:
                total = int(1024 * 1024)
                scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
                width = round(samples.shape[3] * scale_by / 16.0) * 16
                height = round(samples.shape[2] * scale_by / 16.0) * 16
                s = comfy.utils.common_upscale(samples, width, height, "area", "disabled")
                ref_latents.append(vae.encode(s.movedim(1, -1)[:, :, :, :3]))

        # positive: instruction + vision tokens; negative: empty (no vision). Ref latent on both.
        positive = clip.encode_from_tokens_scheduled(clip.tokenize(prompt, images=images_vl))
        negative = clip.encode_from_tokens_scheduled(clip.tokenize(""))

        if len(ref_latents) > 0:
            positive = node_helpers.conditioning_set_values(positive, {"reference_latents": ref_latents}, append=True)
            negative = node_helpers.conditioning_set_values(negative, {"reference_latents": ref_latents}, append=True)

        return io.NodeOutput(positive, negative)


class BooguExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        return [
            TextEncodeBooguEdit,
        ]


async def comfy_entrypoint() -> BooguExtension:
    return BooguExtension()