Fix Qwen3.5 text generation with multiple input images (#13943)

2026-07-03 21:20:49 +08:00 · 2026-05-18 08:16:42 +03:00 · 2026-05-18 08:16:42 +03:00 · b39af210d0
commit b39af210d0
parent aeadb7acaa
1 changed file with 10 additions and 7 deletions
--- a/comfy/text_encoders/qwen35.py
+++ b/comfy/text_encoders/qwen35.py
@@ -760,7 +760,7 @@ class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer):
    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=False, **kwargs):
        image = kwargs.get("image", None)
        if image is not None and len(images) == 0:
-            images = [image]
+            images = [image[i:i + 1] for i in range(image.shape[0])]

        skip_template = False
        if text.startswith('<|im_start|>'):
@@ -771,13 +771,16 @@ class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer):
        if skip_template:
            llama_text = text
        else:
-            if llama_template is None:
-                if len(images) > 0:
-                    llama_text = self.llama_template_images.format(text)
-                else:
-                    llama_text = self.llama_template.format(text)
+            if llama_template is not None:
+                template = llama_template
+            elif len(images) == 0:
+                template = self.llama_template
            else:
-                llama_text = llama_template.format(text)
+                template = self.llama_template_images
+                if len(images) > 1:
+                    vision_block = "<|vision_start|><|image_pad|><|vision_end|>"
+                    template = template.replace(vision_block, vision_block * len(images), 1)
+            llama_text = template.format(text)
            if not thinking:
                llama_text += "<think>\n</think>\n"