From a78d870d499833dbbadceb556c1315e5f882e429 Mon Sep 17 00:00:00 2001
From: Mihail Karaev
Date: Mon, 29 Dec 2025 15:03:59 +0000
Subject: [PATCH] Add image to text encoders

---
 comfy/text_encoders/kandinsky5.py |  2 +-
 comfy_extras/nodes_kandinsky5.py  | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/comfy/text_encoders/kandinsky5.py b/comfy/text_encoders/kandinsky5.py
index ec5a0d5f7..d4351f5f6 100644
--- a/comfy/text_encoders/kandinsky5.py
+++ b/comfy/text_encoders/kandinsky5.py
@@ -24,7 +24,7 @@ class Kandinsky5TokenizerImage(Kandinsky5Tokenizer):
 class Kandinsky5TokenizerI2I(Kandinsky5Tokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
-        self.llama_template = "<|im_start|>system\nYou are a promt engineer. Based on the provided source image (first image) and target image (second image), create an interesting text prompt that can be used together with the source image to create the target image:<|im_end|>\n<|im_start|>user\n{}<|im_end|>"
+        self.llama_template_images = "<|im_start|>system\nYou are a prompt engineer. Based on the provided source image (first image) and target image (second image), create an interesting text prompt that can be used together with the source image to create the target image:<|im_end|>\n<|im_start|>user\n{}<|im_end|><|im_start|>assistant\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"
 
 
 class Qwen25_7BVLIModel(sd1_clip.SDClipModel):
diff --git a/comfy_extras/nodes_kandinsky5.py b/comfy_extras/nodes_kandinsky5.py
index 5c46296c0..3ac1238ab 100644
--- a/comfy_extras/nodes_kandinsky5.py
+++ b/comfy_extras/nodes_kandinsky5.py
@@ -171,6 +171,26 @@ class CLIPTextEncodeKandinsky5(io.ComfyNode):
         return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))
 
 
+class TextEncodeQwenKandinskyI2I(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="TextEncodeQwenKandinskyI2I",
+            category="advanced/conditioning",
+            inputs=[
+                io.Clip.Input("clip"),
+                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
+                io.Image.Input("image", optional=True),
+            ],
+            outputs=[io.Conditioning.Output()],
+        )
+
+    @classmethod
+    def execute(cls, clip, prompt, image=None) -> io.NodeOutput:
+        images = [image,] if image is not None else []
+        tokens = clip.tokenize(prompt, images=images)
+        conditioning = clip.encode_from_tokens_scheduled(tokens)
+        return io.NodeOutput(conditioning)
 
 class Kandinsky5Extension(ComfyExtension):
     @override
@@ -180,6 +200,7 @@ class Kandinsky5Extension(ComfyExtension):
             Kandinsky5ImageToImage,
             NormalizeVideoLatentStart,
             CLIPTextEncodeKandinsky5,
+            TextEncodeQwenKandinskyI2I,
         ]
 
 async def comfy_entrypoint() -> Kandinsky5Extension:
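
Usage note (not part of the patch): the new TextEncodeQwenKandinskyI2I node simply forwards an optional reference image to the Kandinsky 5 tokenizer before the scheduled encode. Below is a minimal sketch of the same call pattern outside of a node graph; the helper name encode_kandinsky_i2i and the assumption that `clip` and `image` come from the usual loader/image nodes are illustrative, and only the clip.tokenize / clip.encode_from_tokens_scheduled calls are taken from the diff above.

    import torch

    def encode_kandinsky_i2i(clip, prompt: str, image: torch.Tensor | None = None):
        # Mirrors the body of TextEncodeQwenKandinskyI2I.execute: an optional ComfyUI
        # image tensor ([B, H, W, C], values in 0..1) is handed to the tokenizer, which
        # fills the <|vision_start|><|image_pad|><|vision_end|> slot of
        # llama_template_images before encoding.
        images = [image] if image is not None else []
        tokens = clip.tokenize(prompt, images=images)
        # Returns the conditioning consumed by the downstream Kandinsky 5 sampler nodes.
        return clip.encode_from_tokens_scheduled(tokens)

    # Example call, assuming `clip` was produced by a CLIP loader node and `src` by LoadImage:
    # cond = encode_kandinsky_i2i(clip, "Turn the photo into a watercolor painting.", src)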