From bfe4b31a3295cf93dea14c8a8aa6723a15a85af1 Mon Sep 17 00:00:00 2001
From: Mihail Karaev
Date: Tue, 23 Dec 2025 15:22:48 +0000
Subject: [PATCH] Add Kandinsky5 i2i pipeline

---
 comfy/model_base.py               | 30 ++++++++++++++++++++
 comfy/sd.py                       |  4 +++
 comfy/supported_models.py         | 25 +++++++++++++++-
 comfy/text_encoders/kandinsky5.py |  5 ++++
 comfy_extras/nodes_kandinsky5.py  | 47 +++++++++++++++++++++++++++++++
 nodes.py                          |  2 +-
 6 files changed, 111 insertions(+), 2 deletions(-)

diff --git a/comfy/model_base.py b/comfy/model_base.py
index 6b8a8454d..d69328deb 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1681,3 +1681,33 @@ class Kandinsky5Image(Kandinsky5):
 
     def concat_cond(self, **kwargs):
         return None
+
+class Kandinsky5ImageToImage(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(
+            model_config,
+            model_type,
+            device=device,
+            unet_model=comfy.ldm.kandinsky5.model.Kandinsky5
+        )
+
+    def encode_adm(self, **kwargs):
+        return kwargs["pooled_output"]
+
+    def concat_cond(self, **kwargs):
+        noise = kwargs["noise"]
+        device = kwargs["device"]
+        image = kwargs.get("latent_image", None)
+        image = utils.resize_to_batch_size(image, noise.shape[0])
+        mask_ones = torch.ones_like(noise)[:, :1].to(device=device)
+        return torch.cat((image, mask_ones), dim=1)
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            out["attention_mask"] = comfy.conds.CONDRegular(attention_mask)
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out["c_crossattn"] = comfy.conds.CONDRegular(cross_attn)
+        return out
\ No newline at end of file
diff --git a/comfy/sd.py b/comfy/sd.py
index 1cad98aef..bf9180b21 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -992,6 +992,7 @@ class CLIPType(Enum):
     OVIS = 21
     KANDINSKY5 = 22
     KANDINSKY5_IMAGE = 23
+    KANDINSKY5_I2I = 24
 
 
 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
@@ -1246,6 +1247,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
         elif clip_type == CLIPType.KANDINSKY5_IMAGE:
             clip_target.clip = comfy.text_encoders.kandinsky5.te(**llama_detect(clip_data))
             clip_target.tokenizer = comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage
+        elif clip_type == CLIPType.KANDINSKY5_I2I:
+            clip_target.clip = comfy.text_encoders.kandinsky5.te(**llama_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.kandinsky5.Kandinsky5TokenizerI2I
         else:
             clip_target.clip = sdxl_clip.SDXLClipModel
             clip_target.tokenizer = sdxl_clip.SDXLTokenizer
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 1888f35ba..ad3c3b40b 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1535,7 +1535,30 @@ class Kandinsky5Image(Kandinsky5):
         hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
         return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage, comfy.text_encoders.kandinsky5.te(**hunyuan_detect))
 
+class Kandinsky5ImageToImage(supported_models_base.BASE):
+    unet_config = {
+        "image_model": "kandinsky5",
+        "model_dim": 2560,
+        "visual_embed_dim": 132,
+    }
 
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5]
+    sampling_settings = {
+        "shift": 3.0,
+    }
+
+    latent_format = latent_formats.Flux
+    memory_usage_factor = 1.25 #TODO
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.Kandinsky5ImageToImage(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        pref = self.text_encoder_key_prefix[0]
+        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
+        return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5TokenizerI2I, comfy.text_encoders.kandinsky5.te(**hunyuan_detect))
+
+
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2, Kandinsky5ImageToImage, Kandinsky5Image, Kandinsky5]
 
 models += [SVD_img2vid]
diff --git a/comfy/text_encoders/kandinsky5.py b/comfy/text_encoders/kandinsky5.py
index be086458c..ec5a0d5f7 100644
--- a/comfy/text_encoders/kandinsky5.py
+++ b/comfy/text_encoders/kandinsky5.py
@@ -21,6 +21,11 @@ class Kandinsky5TokenizerImage(Kandinsky5Tokenizer):
         super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
         self.llama_template = "<|im_start|>system\nYou are a promt engineer. Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>"
 
+class Kandinsky5TokenizerI2I(Kandinsky5Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+        self.llama_template = "<|im_start|>system\nYou are a promt engineer. Based on the provided source image (first image) and target image (second image), create an interesting text prompt that can be used together with the source image to create the target image:<|im_end|>\n<|im_start|>user\n{}<|im_end|>"
+
 
 class Qwen25_7BVLIModel(sd1_clip.SDClipModel):
     def __init__(self, device="cpu", layer="hidden", layer_idx=-1, dtype=None, attention_mask=True, model_options={}):
diff --git a/comfy_extras/nodes_kandinsky5.py b/comfy_extras/nodes_kandinsky5.py
index aaaf83566..5c46296c0 100644
--- a/comfy_extras/nodes_kandinsky5.py
+++ b/comfy_extras/nodes_kandinsky5.py
@@ -1,6 +1,7 @@
 import nodes
 import node_helpers
 import torch
+import torchvision.transforms.functional as F
 
 import comfy.model_management
 import comfy.utils
@@ -55,6 +56,51 @@ class Kandinsky5ImageToVideo(io.ComfyNode):
         return io.NodeOutput(positive, negative, out_latent, cond_latent_out)
 
+
+class Kandinsky5ImageToImage(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="Kandinsky5ImageToImage",
+            category="image",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.Image.Input("start_image"),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+                io.Latent.Output(display_name="latent", tooltip="Encoded start image latent"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, vae, batch_size, start_image) -> io.NodeOutput:
+        height, width = start_image.shape[1:-1]
+
+        available_res = [(1024, 1024), (640, 1408), (1408, 640), (768, 1280), (1280, 768), (896, 1152), (1152, 896)]
+        nearest_index = torch.argmin(torch.Tensor([abs((w / h) - (width / height)) for (w, h) in available_res]))
+        nw, nh = available_res[nearest_index]
+        scale_factor = min(height / nh, width / nw)
+        start_image = start_image.permute(0, 3, 1, 2)
+        start_image = F.resize(start_image, (int(height / scale_factor), int(width / scale_factor)))
+        start_image = F.crop(
+            start_image,
+            (int(height / scale_factor) - nh) // 2,
+            (int(width / scale_factor) - nw) // 2,
+            nh,
+            nw,
+        )
+        print(start_image.shape)
+        start_image = start_image.permute(0, 2, 3, 1)
+        encoded = vae.encode(start_image[:, :, :, :3])
+        out_latent = {"samples": encoded.repeat(batch_size, 1, 1, 1)}
+        return io.NodeOutput(positive, negative, out_latent)
+
+
 
 def adaptive_mean_std_normalization(source, reference, clump_mean_low=0.3, clump_mean_high=0.35, clump_std_low=0.35, clump_std_high=0.5):
     source_mean = source.mean(dim=(1, 3, 4), keepdim=True) # mean over C, H, W
     source_std = source.std(dim=(1, 3, 4), keepdim=True) # std over C, H, W
@@ -131,6 +177,7 @@ class Kandinsky5Extension(ComfyExtension):
     async def get_node_list(self) -> list[type[io.ComfyNode]]:
         return [
             Kandinsky5ImageToVideo,
+            Kandinsky5ImageToImage,
             NormalizeVideoLatentStart,
             CLIPTextEncodeKandinsky5,
         ]
diff --git a/nodes.py b/nodes.py
index 3fa543294..d422a3b00 100644
--- a/nodes.py
+++ b/nodes.py
@@ -970,7 +970,7 @@ class DualCLIPLoader:
     def INPUT_TYPES(s):
         return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ),
                               "clip_name2": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15", "kandinsky5", "kandinsky5_image"], ),
+                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15", "kandinsky5", "kandinsky5_image", "kandinsky5_i2i"], ),
                               },
                "optional": {
                              "device": (["default", "cpu"], {"advanced": True}),