Add image to text encoders

2026-03-10 11:47:34 +08:00 · 2025-12-29 15:03:59 +00:00 · 2025-12-29 15:03:59 +00:00 · a78d870d49
commit a78d870d49
parent bfe4b31a32
2 changed files with 22 additions and 1 deletion
--- a/comfy/text_encoders/kandinsky5.py
+++ b/comfy/text_encoders/kandinsky5.py
@@ -24,7 +24,7 @@ class Kandinsky5TokenizerImage(Kandinsky5Tokenizer):
 class Kandinsky5TokenizerI2I(Kandinsky5Tokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
-        self.llama_template = "<|im_start|>system\nYou are a promt engineer. Based on the provided source image (first image) and target image (second image), create an interesting text prompt that can be used together with the source image to create the target image:<|im_end|>\n<|im_start|>user\n{}<|im_end|>"
+        self.llama_template_images = "<|im_start|>system\nYou are a promt engineer. Based on the provided source image (first image) and target image (second image), create an interesting text prompt that can be used together with the source image to create the target image:<|im_end|>\n<|im_start|>user\n{}<|im_end|><|im_start|>assistant\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"


 class Qwen25_7BVLIModel(sd1_clip.SDClipModel):
--- a/comfy_extras/nodes_kandinsky5.py
+++ b/comfy_extras/nodes_kandinsky5.py
@@ -171,6 +171,26 @@ class CLIPTextEncodeKandinsky5(io.ComfyNode):

        return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))

+class TextEncodeQwenKandinskyI2I(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="TextEncodeQwenKandinskyI2I",
+            category="advanced/conditioning",
+            inputs=[
+                io.Clip.Input("clip"),
+                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
+                io.Image.Input("image", optional=True),
+            ],
+            outputs=[io.Conditioning.Output()],
+        )
+
+    @classmethod
+    def execute(cls, clip, prompt, image=None) -> io.NodeOutput:
+        images = [image,] if image is not None else []
+        tokens = clip.tokenize(prompt, images=images)
+        conditioning = clip.encode_from_tokens_scheduled(tokens)
+        return io.NodeOutput(conditioning)

 class Kandinsky5Extension(ComfyExtension):
    @override
@@ -180,6 +200,7 @@ class Kandinsky5Extension(ComfyExtension):
            Kandinsky5ImageToImage,
            NormalizeVideoLatentStart,
            CLIPTextEncodeKandinsky5,
+            TextEncodeQwenKandinskyI2I,
        ]

 async def comfy_entrypoint() -> Kandinsky5Extension: