Fix Qwen3.5 text generation with multiple input images (#13943)

This commit is contained in:
Jukka Seppänen 2026-05-18 08:16:42 +03:00 committed by GitHub
parent aeadb7acaa
commit b39af210d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -760,7 +760,7 @@ class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer):
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=False, **kwargs): def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=False, **kwargs):
image = kwargs.get("image", None) image = kwargs.get("image", None)
if image is not None and len(images) == 0: if image is not None and len(images) == 0:
images = [image] images = [image[i:i + 1] for i in range(image.shape[0])]
skip_template = False skip_template = False
if text.startswith('<|im_start|>'): if text.startswith('<|im_start|>'):
@@ -771,13 +771,16 @@ class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer):
if skip_template: if skip_template:
llama_text = text llama_text = text
else: else:
if llama_template is None: if llama_template is not None:
if len(images) > 0: template = llama_template
llama_text = self.llama_template_images.format(text) elif len(images) == 0:
else: template = self.llama_template
llama_text = self.llama_template.format(text)
else: else:
llama_text = llama_template.format(text) template = self.llama_template_images
if len(images) > 1:
vision_block = "<|vision_start|><|image_pad|><|vision_end|>"
template = template.replace(vision_block, vision_block * len(images), 1)
llama_text = template.format(text)
if not thinking: if not thinking:
llama_text += "<think>\n</think>\n" llama_text += "<think>\n</think>\n"