Fix Qwen3.5 text generation with multiple input images (#13943)

This commit is contained in:
Jukka Seppänen 2026-05-18 08:16:42 +03:00 committed by GitHub
parent aeadb7acaa
commit b39af210d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -760,7 +760,7 @@ class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer):
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=False, **kwargs):
image = kwargs.get("image", None)
if image is not None and len(images) == 0:
-images = [image]
+images = [image[i:i + 1] for i in range(image.shape[0])]
skip_template = False
if text.startswith('<|im_start|>'):
@@ -771,13 +771,16 @@ class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer):
if skip_template:
llama_text = text
else:
-if llama_template is None:
-if len(images) > 0:
-llama_text = self.llama_template_images.format(text)
-else:
-llama_text = self.llama_template.format(text)
+if llama_template is not None:
+template = llama_template
+elif len(images) == 0:
+template = self.llama_template
else:
-llama_text = llama_template.format(text)
+template = self.llama_template_images
+if len(images) > 1:
+vision_block = "<|vision_start|><|image_pad|><|vision_end|>"
+template = template.replace(vision_block, vision_block * len(images), 1)
+llama_text = template.format(text)
if not thinking:
llama_text += "<think>\n</think>\n"