diff --git a/comfy/text_encoders/qwen35.py b/comfy/text_encoders/qwen35.py index b022009b1..416ce9d18 100644 --- a/comfy/text_encoders/qwen35.py +++ b/comfy/text_encoders/qwen35.py @@ -760,7 +760,7 @@ class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer): def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=False, **kwargs): image = kwargs.get("image", None) if image is not None and len(images) == 0: - images = [image] + images = [image[i:i + 1] for i in range(image.shape[0])] skip_template = False if text.startswith('<|im_start|>'): @@ -771,13 +771,16 @@ class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer): if skip_template: llama_text = text else: - if llama_template is None: - if len(images) > 0: - llama_text = self.llama_template_images.format(text) - else: - llama_text = self.llama_template.format(text) + if llama_template is not None: + template = llama_template + elif len(images) == 0: + template = self.llama_template else: - llama_text = llama_template.format(text) + template = self.llama_template_images + if len(images) > 1: + vision_block = "<|vision_start|><|image_pad|><|vision_end|>" + template = template.replace(vision_block, vision_block * len(images), 1) + llama_text = template.format(text) if not thinking: llama_text += "\n\n"