Mirror of https://github.com/comfyanonymous/ComfyUI.git
Commit: 25e636fb65 ("Qwen2")
Parent: e8eab4dbc6
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Union, Callable, List, Optional, Protocol, runtime_checkable
+from typing import Union, Callable, List, Optional, Protocol, runtime_checkable, Literal
 
 import numpy as np
 import torch
@@ -63,6 +63,27 @@ LLaVAProcessor = Callable[
 ]
 
 
+class LanguageMessage(TypedDict):
+    role: Literal["system", "user", "assistant"]
+    content: str | MessageContent
+
+
+class MessageContentImage(TypedDict):
+    url: NotRequired[str]
+
+
+class MessageContent(TypedDict):
+    type: Literal["text", "image", "video", "image_url"]
+    text: NotRequired[str]
+    image: NotRequired[str]
+    image_url: NotRequired[MessageContentImage]
+    min_pixels: NotRequired[int]
+    max_pixels: NotRequired[int]
+
+
+LanguagePrompt = list[LanguageMessage]
+
+
 @runtime_checkable
 class LanguageModel(Protocol):
     @staticmethod
@@ -78,7 +99,7 @@ class LanguageModel(Protocol):
                  **kwargs) -> str:
         ...
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         ...
 
     @property
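For orientation, a minimal sketch (not part of the commit) of a LanguagePrompt built from these TypedDicts; the import path is the one used by the OpenAI wrapper changes later in this diff, and tokenize() now accepts either a plain string or such a message list.

    # Hypothetical usage sketch of the new message types; not included in this commit.
    from comfy.language.language_types import LanguagePrompt

    prompt: LanguagePrompt = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": {"type": "text", "text": "Describe the contents of this image."}},
    ]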
@@ -7,7 +7,7 @@ import operator
 import pathlib
 import warnings
 from functools import reduce
-from typing import Optional, Any, Callable, List
+from typing import Optional, Any, Callable
 
 import torch
 from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, AutoProcessor, AutoTokenizer, \
@@ -18,12 +18,13 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPIN
 
 from .chat_templates import KNOWN_CHAT_TEMPLATES
 from .language_types import ProcessorResult, TOKENS_TYPE, GENERATION_KWARGS_TYPE, TransformerStreamedProgress, \
-    LLaVAProcessor, LanguageModel
+    LLaVAProcessor, LanguageModel, LanguagePrompt
 from .. import model_management
+from ..component_model.tensor_types import RGBImageBatch
 from ..model_downloader import get_or_download_huggingface_repo
 from ..model_management import unet_offload_device, get_torch_device, unet_dtype, load_models_gpu
 from ..model_management_types import ModelManageable
-from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block
+from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block, tensor2pil
 
 
 class TransformersManagedModel(ModelManageable, LanguageModel):
@@ -54,8 +55,9 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if subfolder is not None and subfolder != "":
             hub_kwargs["subfolder"] = subfolder
         repo_id = ckpt_name
-        ckpt_name = get_or_download_huggingface_repo(ckpt_name)
+        with comfy_tqdm():
+            ckpt_name = get_or_download_huggingface_repo(ckpt_name)
 
         from_pretrained_kwargs = {
             "pretrained_model_name_or_path": ckpt_name,
             "trust_remote_code": True,
@@ -323,7 +325,7 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if processor is not None and hasattr(processor, "image_processor") and hasattr(processor.image_processor, "do_rescale"):
             processor.image_processor.do_rescale = False
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         tokenizer = self.tokenizer
         assert tokenizer is not None
         assert hasattr(tokenizer, "decode")
@@ -335,32 +337,57 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if len(candidate_chat_templates) > 0:
             filename, chat_template = candidate_chat_templates[0]
             logging.debug(f"Selected chat template filename={filename} for {self.model.name_or_path}")
-        if isinstance(images, list):
-            images = torch.stack(images, dim=0)
+        if images is not None:
+            # PIL it for the sake of simplicity
+            image_sizes = [(image.shape[-2], image.shape[-3]) for image in images]
+        else:
+            image_sizes = []
+            images = []
 
         try:
             if hasattr(tokenizer, "apply_chat_template"):
-                # todo: this should come from node inputs
-                prompt = tokenizer.apply_chat_template([
-                    {"role": "user", "content": prompt},
-                ], chat_template=chat_template, add_generation_prompt=True, tokenize=False)
+                messages: LanguagePrompt
+                if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict):
+                    messages = prompt
+                elif "content[" in chat_template:
+                    messages = [
+                        {"role": "user",
+                         "content": [
+                             {
+                                 "type": "text",
+                                 "text": prompt
+                             }
+                         ] + [
+                             {"type": "image"} for _ in range(len(images))
+                         ]
+                         }
+                    ]
+                else:
+                    messages = [
+                        {"role": "user", "content": prompt},
+                    ]
+                prompt = tokenizer.apply_chat_template(messages, chat_template=chat_template, add_generation_prompt=True, tokenize=False)
         except Exception as exc:
             logging.debug("Could not apply chat template", exc_info=exc)
 
-        if self.processor is None:
+        if self.processor is None and isinstance(prompt, str):
             batch_encoding = tokenizer(prompt, return_tensors="pt").to(device=self.load_device)
             return {**batch_encoding}
         else:
             assert images is not None and len(images) > 0, "When using a multi-modal model, pass at least one, possibly empty, image"
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.load_device)
 
-            assert "<image>" in prompt.lower(), "You must specify a <image> token inside the prompt for it to be substituted correctly by a HuggingFace processor"
-            batch_feature: BatchFeature = self.processor([prompt], images=images.unbind(), padding=True, return_tensors="pt")
+            batch_feature: BatchFeature = self.processor(text=[prompt], images=images.unbind(), return_tensors="pt", padding=True)
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.offload_device)
             assert "input_ids" in batch_feature
             batch_feature.to(device=self.load_device, dtype=self.model_dtype())
             # noinspection PyTypeChecker
             return {
-                "image_sizes": [(images.shape[-1], image.shape[-2]) for image in images],
+                "image_sizes": image_sizes,
                 "images": batch_feature["pixel_values"],
                 "inputs": batch_feature["input_ids"],
                 **batch_feature
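To make the new branching in tokenize() easier to follow, a self-contained sketch of the message-selection rule: a list of role/content dicts passes through unchanged, a chat template that indexes into message content (the '"content[" in chat_template' check) gets per-image {"type": "image"} placeholders, and anything else is wrapped in a single user message. The helper name and the sample template string below are invented for illustration only.

    # Illustrative stand-alone mirror of the selection rule above; the function name and
    # the sample template string are hypothetical, the logic follows the diff.
    def build_messages(prompt, chat_template: str, num_images: int) -> list:
        if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict):
            return prompt  # already a LanguagePrompt-style message list
        if "content[" in chat_template:
            return [{
                "role": "user",
                "content": [{"type": "text", "text": prompt}]
                           + [{"type": "image"} for _ in range(num_images)],
            }]
        return [{"role": "user", "content": prompt}]


    # A template that iterates over structured content triggers the image placeholders:
    print(build_messages("Describe this image.", "{{ message.content[0] }}", 1))
    # -> [{'role': 'user', 'content': [{'type': 'text', 'text': 'Describe this image.'}, {'type': 'image'}]}]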
@@ -423,6 +423,7 @@ KNOWN_HUGGINGFACE_MODEL_REPOS: Final[Set[str]] = {
     'facebook/nllb-200-distilled-1.3B',
     'THUDM/chatglm3-6b',
     'roborovski/superprompt-v1',
+    'Qwen/Qwen2-VL-7B-Instruct',
 }
 
 KNOWN_UNET_MODELS: Final[KnownDownloadables] = KnownDownloadables([
@@ -7,9 +7,9 @@ import math
 import random
 import logging
 
-from PIL import Image, ImageOps, ImageSequence, ImageFile
+from PIL import Image, ImageOps, ImageSequence
 from PIL.PngImagePlugin import PngInfo
-from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub import snapshot_download
 from natsort import natsorted
 import numpy as np
 import safetensors.torch
@@ -948,3 +948,11 @@ def seed_for_block(seed):
         np.random.set_state(numpy_rng_state)
         if torch.cuda.is_available():
             torch.cuda.set_rng_state_all(cuda_rng_state)
+
+
+def pil2tensor(image: Image) -> torch.Tensor:
+    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
+
+
+def tensor2pil(t_image: torch.Tensor) -> Image:
+    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
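Since the two helpers now live in the shared utils module (the SVG node change below imports tensor2pil from there), a short round-trip sketch, assuming the comfy.utils import path from this commit; ComfyUI image tensors are [batch, height, width, channel] floats in 0..1.

    from PIL import Image

    from comfy.utils import pil2tensor, tensor2pil  # import path assumed from this commit

    pil_image = Image.new("RGB", (64, 64), color=(255, 0, 0))
    tensor = pil2tensor(pil_image)   # shape [1, 64, 64, 3], float32 in 0..1
    restored = tensor2pil(tensor)    # back to a 64x64 RGB PIL image
    assert tuple(tensor.shape) == (1, 64, 64, 3)
    assert restored.size == (64, 64)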
@@ -14,7 +14,7 @@ from openai.types.chat import ChatCompletionMessageParam
 from comfy.cli_args import args
 from comfy.component_model.tensor_types import RGBImageBatch
 from comfy.language.language_types import LanguageModel, ProcessorResult, GENERATION_KWARGS_TYPE, TOKENS_TYPE, \
-    TransformerStreamedProgress
+    TransformerStreamedProgress, LanguagePrompt
 from comfy.nodes.package_typing import CustomNode, InputTypes
 from comfy.utils import comfy_progress, ProgressBar, seed_for_block
 
@@ -120,7 +120,7 @@ class OpenAILanguageModelWrapper(LanguageModel):
 
         return full_response
 
-    def tokenize(self, prompt: str, images: RGBImageBatch, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         # OpenAI API doesn't require explicit tokenization, so we'll just return the prompt and images as is
         return {
             "inputs": [prompt],
@@ -7,6 +7,7 @@ import vtracer
 from PIL import Image
 
 from comfy.nodes.package_typing import CustomNode
+from comfy.utils import tensor2pil
 
 
 def RGB2RGBA(image: Image, mask: Image) -> Image:
@@ -14,14 +15,6 @@ def RGB2RGBA(image: Image, mask: Image) -> Image:
     return Image.merge('RGBA', (R, G, B, mask.convert('L')))
 
 
-def pil2tensor(image: Image) -> torch.Tensor:
-    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
-
-
-def tensor2pil(t_image: torch.Tensor) -> Image:
-    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
-
-
 class ImageToSVG(CustomNode):
     @classmethod
     def INPUT_TYPES(cls):
tests/inference/workflows/qwen2-vl-0.json (new file, 106 lines)
@@ -0,0 +1,106 @@
+{
+  "4": {
+    "inputs": {
+      "ckpt_name": [
+        "20",
+        0
+      ],
+      "subfolder": ""
+    },
+    "class_type": "TransformersLoader",
+    "_meta": {
+      "title": "TransformersLoader"
+    }
+  },
+  "5": {
+    "inputs": {
+      "value": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
+      "name": "",
+      "title": "Image to query",
+      "description": "",
+      "__required": true
+    },
+    "class_type": "ImageRequestParameter",
+    "_meta": {
+      "title": "ImageRequestParameter"
+    }
+  },
+  "7": {
+    "inputs": {
+      "prompt": "Describe the contents of this image.",
+      "chat_template": "default",
+      "model": [
+        "4",
+        0
+      ],
+      "images": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "OneShotInstructTokenize",
+    "_meta": {
+      "title": "OneShotInstructTokenize"
+    }
+  },
+  "9": {
+    "inputs": {
+      "max_new_tokens": 512,
+      "repetition_penalty": 0,
+      "seed": 2598326659,
+      "__tokens": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting.<|im_end|>",
+      "model": [
+        "4",
+        0
+      ],
+      "tokens": [
+        "7",
+        0
+      ]
+    },
+    "class_type": "TransformersGenerate",
+    "_meta": {
+      "title": "TransformersGenerate"
+    }
+  },
+  "11": {
+    "inputs": {
+      "value": [
+        "9",
+        0
+      ],
+      "filename_prefix": "ComfyUI",
+      "extension": ".txt",
+      "output": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting."
+    },
+    "class_type": "SaveString",
+    "_meta": {
+      "title": "SaveString"
+    }
+  },
+  "20": {
+    "inputs": {
+      "value": "Qwen/Qwen2-VL-7B-Instruct",
+      "name": "",
+      "title": "",
+      "description": "",
+      "__required": true
+    },
+    "class_type": "StringEnumRequestParameter",
+    "_meta": {
+      "title": "StringEnumRequestParameter"
+    }
+  },
+  "21": {
+    "inputs": {
+      "images": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "PreviewImage",
+    "_meta": {
+      "title": "Preview Image"
+    }
+  }
+}
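For readers unfamiliar with the API-format workflows used by these inference tests, each ["<node id>", <output index>] pair inside an "inputs" map links to another node's output. A small sketch, standard library only, with the path taken from the file name above:

    import json

    with open("tests/inference/workflows/qwen2-vl-0.json") as f:
        workflow = json.load(f)

    for node_id, node in workflow.items():
        links = {name: value for name, value in node["inputs"].items() if isinstance(value, list)}
        print(node_id, node["class_type"], "<-", links)
    # e.g. node "7" (OneShotInstructTokenize) takes "model" from node "4" and "images" from node "5".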