diff --git a/comfy/text_encoders/qwen35.py b/comfy/text_encoders/qwen35.py
index b022009b1..416ce9d18 100644
--- a/comfy/text_encoders/qwen35.py
+++ b/comfy/text_encoders/qwen35.py
@@ -760,7 +760,7 @@ class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer):
     def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=False, **kwargs):
         image = kwargs.get("image", None)
         if image is not None and len(images) == 0:
-            images = [image]
+            images = [image[i:i + 1] for i in range(image.shape[0])]
 
         skip_template = False
         if text.startswith('<|im_start|>'):
@@ -771,13 +771,16 @@ class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer):
         if skip_template:
             llama_text = text
         else:
-            if llama_template is None:
-                if len(images) > 0:
-                    llama_text = self.llama_template_images.format(text)
-                else:
-                    llama_text = self.llama_template.format(text)
+            if llama_template is not None:
+                template = llama_template
+            elif len(images) == 0:
+                template = self.llama_template
             else:
-                llama_text = llama_template.format(text)
+                template = self.llama_template_images
+                if len(images) > 1:
+                    vision_block = "<|vision_start|><|image_pad|><|vision_end|>"
+                    template = template.replace(vision_block, vision_block * len(images), 1)
+            llama_text = template.format(text)
             if not thinking:
                 llama_text += "<think>\n</think>\n"