mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-06-18 22:09:38 +08:00
98 lines
4.1 KiB
Python
98 lines
4.1 KiB
Python
import math
|
|
|
|
import node_helpers
|
|
import comfy.utils
|
|
from typing_extensions import override
|
|
from comfy_api.latest import ComfyExtension, io
|
|
|
|
|
|
class TextEncodeBooguEdit(io.ComfyNode):
|
|
"""Boogu-Image Edit conditioning.
|
|
|
|
The edit image is used twice, matching the reference pipeline:
|
|
- Qwen3-VL vision tokens (instruction understanding) -> positive only
|
|
- VAE reference latent (image identity) -> positive and negative
|
|
The ref latent is in both conds so it cancels under CFG (identity preserved);
|
|
the vision tokens are only in the positive so CFG amplifies the instruction.
|
|
The tokenizer selects the right system prompt automatically (image -> TI2I,
|
|
empty negative -> DROP), so no template plumbing is needed here.
|
|
"""
|
|
|
|
@classmethod
|
|
def define_schema(cls):
|
|
return io.Schema(
|
|
node_id="TextEncodeBooguEdit",
|
|
category="model/conditioning/boogu",
|
|
inputs=[
|
|
io.Clip.Input("clip"),
|
|
io.String.Input("prompt", multiline=True, dynamic_prompts=True),
|
|
io.String.Input("negative_prompt", multiline=True, dynamic_prompts=True, advanced=True),
|
|
io.Vae.Input("vae"),
|
|
io.Autogrow.Input(
|
|
"images",
|
|
template=io.Autogrow.TemplateNames(
|
|
io.Image.Input("image"),
|
|
names=[f"image_{i}" for i in range(1, 17)],
|
|
min=0,
|
|
),
|
|
tooltip="Reference image(s) to edit. Boogu focuses on one reference per sample; more are allowed.",
|
|
),
|
|
],
|
|
outputs=[
|
|
io.Conditioning.Output(display_name="positive"),
|
|
io.Conditioning.Output(display_name="negative"),
|
|
],
|
|
)
|
|
|
|
@classmethod
|
|
def execute(cls, clip, prompt, negative_prompt, vae=None, images: io.Autogrow.Type = None) -> io.NodeOutput:
|
|
ref_latents = []
|
|
images_vl = []
|
|
|
|
images = images or {}
|
|
for name in sorted(images, key=lambda n: int(n.rsplit("_", 1)[-1])):
|
|
image = images[name]
|
|
if image is None:
|
|
continue
|
|
samples = image.movedim(-1, 1)
|
|
|
|
# Vision tower input: the reference caps the VLM image at 384x384
|
|
# (max_vlm_input_pil_pixels in pipeline_boogu.py).
|
|
total = int(384 * 384)
|
|
scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
|
|
width = round(samples.shape[3] * scale_by)
|
|
height = round(samples.shape[2] * scale_by)
|
|
s = comfy.utils.common_upscale(samples, width, height, "area", "disabled")
|
|
images_vl.append(s.movedim(1, -1)[:, :, :, :3])
|
|
|
|
# Reference latent: align to 16 px (VAE /8 * patch_size 2).
|
|
if vae is not None:
|
|
total = int(1024 * 1024)
|
|
scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
|
|
width = round(samples.shape[3] * scale_by / 16.0) * 16
|
|
height = round(samples.shape[2] * scale_by / 16.0) * 16
|
|
s = comfy.utils.common_upscale(samples, width, height, "area", "disabled")
|
|
ref_latents.append(vae.encode(s.movedim(1, -1)[:, :, :, :3]))
|
|
|
|
# positive: instruction + vision tokens; negative: empty (no vision). Ref latent on both.
|
|
positive = clip.encode_from_tokens_scheduled(clip.tokenize(prompt, images=images_vl))
|
|
negative = clip.encode_from_tokens_scheduled(clip.tokenize(negative_prompt))
|
|
|
|
if len(ref_latents) > 0:
|
|
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": ref_latents}, append=True)
|
|
negative = node_helpers.conditioning_set_values(negative, {"reference_latents": ref_latents}, append=True)
|
|
|
|
return io.NodeOutput(positive, negative)
|
|
|
|
|
|
class BooguExtension(ComfyExtension):
|
|
@override
|
|
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
|
return [
|
|
TextEncodeBooguEdit,
|
|
]
|
|
|
|
|
|
async def comfy_entrypoint() -> BooguExtension:
|
|
return BooguExtension()
|