import node_helpers
import comfy.utils
from typing_extensions import override
from comfy_api.latest import ComfyExtension, io

# Candidate output resolutions as (height, width) pairs; the ordering is fixed
# by `_find_best_bucket`, which compares `hw[0] / hw[1]` against height / width.
# fmt: off
BUCKETS_1024 = [
    (512, 1792), (512, 1856), (512, 1920), (512, 1984), (512, 2048),
    (576, 1600), (576, 1664), (576, 1728), (576, 1792),
    (640, 1472), (640, 1536), (640, 1600),
    (704, 1344), (704, 1408), (704, 1472),
    (768, 1216), (768, 1280), (768, 1344),
    (832, 1152), (832, 1216),
    (896, 1088), (896, 1152),
    (960, 1024), (960, 1088),
    (1024, 960), (1024, 1024),
    (1088, 896), (1088, 960),
    (1152, 832), (1152, 896),
    (1216, 768), (1216, 832),
    (1280, 768),
    (1344, 704), (1344, 768),
    (1408, 704),
    (1472, 640), (1472, 704),
    (1536, 640),
    (1600, 576), (1600, 640),
    (1664, 576),
    (1728, 576),
    (1792, 512), (1792, 576),
    (1856, 512), (1920, 512), (1984, 512), (2048, 512),
]
# fmt: on


def _find_best_bucket(height: int, width: int) -> tuple[int, int]:
    """Return the ``(height, width)`` bucket closest in aspect ratio.

    Closeness is the absolute difference between the bucket's height/width
    ratio and ``height / width``. On a tie, the bucket listed first in
    ``BUCKETS_1024`` is returned (``min`` keeps the first minimum).

    Raises:
        ZeroDivisionError: if ``width`` is 0.
    """
    target_ratio = height / width
    return min(BUCKETS_1024, key=lambda hw: abs(hw[0] / hw[1] - target_ratio))


def _resize_to_bucket(image):
    """Resize a channels-last image batch to its best-matching bucket.

    Shared by both text-encode nodes so they cannot drift apart.

    Args:
        image: 4-D tensor indexed as (batch, height, width, channels).

    Returns:
        The resized batch, channels-last, keeping at most the first three
        channels (any channel past index 2, e.g. alpha, is dropped).
    """
    # common_upscale is called on a channels-first tensor, so move channels to
    # dim 1 for the resize and back to the last dim afterwards.
    samples = image.movedim(-1, 1)
    src_h, src_w = samples.shape[2], samples.shape[3]
    bucket_h, bucket_w = _find_best_bucket(src_h, src_w)
    # NOTE(review): "center" is passed as the crop mode; presumably it
    # center-crops to the bucket's aspect ratio before scaling -- confirm
    # against comfy.utils.common_upscale.
    resized = comfy.utils.common_upscale(samples, bucket_w, bucket_h, "bilinear", "center")
    return resized.movedim(1, -1)[:, :, :, :3]


class TextEncodeJoyImageEdit(io.ComfyNode):
    """JoyImageEdit single-image text-encode node.

    The input image is bucket-resized, handed to the CLIP tokenizer together
    with the prompt, VAE-encoded, and appended to
    ``conditioning["reference_latents"]``.
    """

    @classmethod
    def define_schema(cls):
        """Declare the node id, category, inputs and outputs."""
        return io.Schema(
            node_id="TextEncodeJoyImageEdit",
            category="advanced/conditioning",
            inputs=[
                io.Clip.Input("clip"),
                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
                io.Vae.Input("vae"),
                io.Image.Input("image"),
            ],
            outputs=[
                io.Conditioning.Output(),
                io.Image.Output(display_name="image"),
            ],
        )

    @classmethod
    def execute(cls, clip, prompt, vae, image) -> io.NodeOutput:
        """Encode ``prompt`` with ``image`` as the single reference.

        Returns:
            A node output holding the conditioning (with the reference latent
            appended) and the bucket-resized image.
        """
        resized_image = _resize_to_bucket(image)

        tokens = clip.tokenize(prompt, images=[resized_image])
        conditioning = clip.encode_from_tokens_scheduled(tokens)

        ref_latent = vae.encode(resized_image)
        conditioning = node_helpers.conditioning_set_values(
            conditioning,
            {"reference_latents": [ref_latent]},
            append=True,
        )
        return io.NodeOutput(conditioning, resized_image)


class TextEncodeJoyImageEditPlus(io.ComfyNode):
    """JoyImageEdit multi-image (Plus) text-encode node.

    Accepts 1-6 optional reference images. Each supplied image is
    bucket-resized independently (same buckets/resize as the single-image
    node), VAE-encoded, and appended in order to
    ``conditioning["reference_latents"]`` (image1 → ref0, image2 → ref1, ...).
    All resized images are passed to the VL tower in one call; the tokenizer
    emits one ``<|vision_start|><|image_pad|><|vision_end|>`` block per image.
    """

    # Not read by the code below; it mirrors the six hand-written ``imageN``
    # inputs in ``define_schema`` / ``execute`` -- keep the three in sync.
    MAX_IMAGES = 6

    @classmethod
    def define_schema(cls):
        """Declare the node id, category, inputs and outputs."""
        return io.Schema(
            node_id="TextEncodeJoyImageEditPlus",
            category="advanced/conditioning",
            inputs=[
                io.Clip.Input("clip"),
                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
                io.Vae.Input("vae"),
                io.Image.Input("image1", optional=True),
                io.Image.Input("image2", optional=True),
                io.Image.Input("image3", optional=True),
                io.Image.Input("image4", optional=True),
                io.Image.Input("image5", optional=True),
                io.Image.Input("image6", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(),
                io.Image.Output(display_name="image"),
            ],
        )

    @classmethod
    def execute(cls, clip, prompt, vae, image1=None, image2=None, image3=None,
                image4=None, image5=None, image6=None) -> io.NodeOutput:
        """Encode ``prompt`` with every supplied image as a reference.

        Unconnected (``None``) slots are skipped, so reference indices follow
        the order of the images that were actually supplied.

        Returns:
            A node output holding the conditioning (with all reference latents
            appended) and the last bucket-resized image.

        Raises:
            ValueError: if no image input is supplied.
        """
        images = [image1, image2, image3, image4, image5, image6]
        supplied = [img for img in images if img is not None]
        if not supplied:
            raise ValueError(
                "TextEncodeJoyImageEditPlus requires at least one reference image."
            )

        resized_images = []
        ref_latents = []
        for image in supplied:
            resized_image = _resize_to_bucket(image)
            resized_images.append(resized_image)
            ref_latents.append(vae.encode(resized_image))

        tokens = clip.tokenize(prompt, images=resized_images)
        conditioning = clip.encode_from_tokens_scheduled(tokens)
        conditioning = node_helpers.conditioning_set_values(
            conditioning,
            {"reference_latents": ref_latents},
            append=True,
        )

        # The last reference sets the target resolution; return it for VAEEncode and the
        # matching negative encode.
        return io.NodeOutput(conditioning, resized_images[-1])


class JoyImageExtension(ComfyExtension):
    """Extension that registers the JoyImageEdit text-encode nodes."""

    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        """Return the node classes provided by this extension."""
        return [
            TextEncodeJoyImageEdit,
            TextEncodeJoyImageEditPlus,
        ]


async def comfy_entrypoint() -> JoyImageExtension:
    """Create and return the extension instance."""
    return JoyImageExtension()