Qwen2

commit 25e636fb65 (parent e8eab4dbc6)
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Union, Callable, List, Optional, Protocol, runtime_checkable
+from typing import Union, Callable, List, Optional, Protocol, runtime_checkable, Literal
 
 import numpy as np
 import torch
@@ -63,6 +63,27 @@ LLaVAProcessor = Callable[
 ]
 
 
+class LanguageMessage(TypedDict):
+    role: Literal["system", "user", "assistant"]
+    content: str | MessageContent
+
+
+class MessageContentImage(TypedDict):
+    url: NotRequired[str]
+
+
+class MessageContent(TypedDict):
+    type: Literal["text", "image", "video", "image_url"]
+    text: NotRequired[str]
+    image: NotRequired[str]
+    image_url: NotRequired[MessageContentImage]
+    min_pixels: NotRequired[int]
+    max_pixels: NotRequired[int]
+
+
+LanguagePrompt = list[LanguageMessage]
+
+
 @runtime_checkable
 class LanguageModel(Protocol):
     @staticmethod
@@ -78,7 +99,7 @@ class LanguageModel(Protocol):
                  **kwargs) -> str:
         ...
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         ...
 
     @property
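The added TypedDicts give multi-part chat prompts a checkable shape: a LanguagePrompt is a list of role/content messages, and each content may be a plain string or a single MessageContent part. A minimal sketch of a value that type-checks against the definitions above (the URL and pixel bounds are illustrative, not from the commit):

    # Illustrative LanguagePrompt value; the URL is a placeholder.
    example_prompt: LanguagePrompt = [
        {"role": "system", "content": "You are a concise captioning assistant."},
        {
            "role": "user",
            "content": {
                "type": "image_url",
                "image_url": {"url": "https://example.com/dog.jpg"},  # placeholder
                "min_pixels": 256 * 256,
                "max_pixels": 1280 * 1280,
            },
        },
        {"role": "user", "content": "Describe the contents of this image."},
    ]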
@@ -7,7 +7,7 @@ import operator
 import pathlib
 import warnings
 from functools import reduce
-from typing import Optional, Any, Callable, List
+from typing import Optional, Any, Callable
 
 import torch
 from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, AutoProcessor, AutoTokenizer, \
@@ -18,12 +18,13 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPIN
 
 from .chat_templates import KNOWN_CHAT_TEMPLATES
 from .language_types import ProcessorResult, TOKENS_TYPE, GENERATION_KWARGS_TYPE, TransformerStreamedProgress, \
-    LLaVAProcessor, LanguageModel
+    LLaVAProcessor, LanguageModel, LanguagePrompt
 from .. import model_management
+from ..component_model.tensor_types import RGBImageBatch
 from ..model_downloader import get_or_download_huggingface_repo
 from ..model_management import unet_offload_device, get_torch_device, unet_dtype, load_models_gpu
 from ..model_management_types import ModelManageable
-from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block
+from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block, tensor2pil
 
 
 class TransformersManagedModel(ModelManageable, LanguageModel):
@@ -54,8 +55,9 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if subfolder is not None and subfolder != "":
             hub_kwargs["subfolder"] = subfolder
         repo_id = ckpt_name
-        ckpt_name = get_or_download_huggingface_repo(ckpt_name)
         with comfy_tqdm():
+            ckpt_name = get_or_download_huggingface_repo(ckpt_name)
+
             from_pretrained_kwargs = {
                 "pretrained_model_name_or_path": ckpt_name,
                 "trust_remote_code": True,
@@ -323,7 +325,7 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if processor is not None and hasattr(processor, "image_processor") and hasattr(processor.image_processor, "do_rescale"):
             processor.image_processor.do_rescale = False
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         tokenizer = self.tokenizer
         assert tokenizer is not None
         assert hasattr(tokenizer, "decode")
@@ -335,32 +337,57 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if len(candidate_chat_templates) > 0:
             filename, chat_template = candidate_chat_templates[0]
             logging.debug(f"Selected chat template filename={filename} for {self.model.name_or_path}")
+        if isinstance(images, list):
+            images = torch.stack(images, dim=0)
+        if images is not None:
+            # PIL it for the sake of simplicity
+            image_sizes = [(image.shape[-2], image.shape[-3]) for image in images]
+        else:
+            image_sizes = []
+            images = []
+
         try:
             if hasattr(tokenizer, "apply_chat_template"):
-                # todo: this should come from node inputs
-                prompt = tokenizer.apply_chat_template([
-                    {"role": "user", "content": prompt},
-                ], chat_template=chat_template, add_generation_prompt=True, tokenize=False)
+                messages: LanguagePrompt
+                if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict):
+                    messages = prompt
+                elif "content[" in chat_template:
+                    messages = [
+                        {"role": "user",
+                         "content": [
+                             {
+                                 "type": "text",
+                                 "text": prompt
+                             }
+                         ] + [
+                             {"type": "image"} for _ in range(len(images))
+                         ]
+
+                         }
+                    ]
+                else:
+                    messages = [
+                        {"role": "user", "content": prompt},
+                    ]
+                prompt = tokenizer.apply_chat_template(messages, chat_template=chat_template, add_generation_prompt=True, tokenize=False)
         except Exception as exc:
             logging.debug("Could not apply chat template", exc_info=exc)
 
-        if self.processor is None:
+        if self.processor is None and isinstance(prompt, str):
             batch_encoding = tokenizer(prompt, return_tensors="pt").to(device=self.load_device)
             return {**batch_encoding}
         else:
-            assert images is not None and len(images) > 0, "When using a multi-modal model, pass at least one, possibly empty, image"
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.load_device)
 
-            assert "<image>" in prompt.lower(), "You must specify a <image> token inside the prompt for it to be substituted correctly by a HuggingFace processor"
-            batch_feature: BatchFeature = self.processor(text=[prompt], images=images.unbind(), return_tensors="pt", padding=True)
+            batch_feature: BatchFeature = self.processor([prompt], images=images.unbind(), padding=True, return_tensors="pt")
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.offload_device)
             assert "input_ids" in batch_feature
             batch_feature.to(device=self.load_device, dtype=self.model_dtype())
             # noinspection PyTypeChecker
             return {
-                "image_sizes": [(images.shape[-1], image.shape[-2]) for image in images],
+                "image_sizes": image_sizes,
                 "images": batch_feature["pixel_values"],
                 "inputs": batch_feature["input_ids"],
                 **batch_feature
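With this change, tokenize accepts either a plain string, which is wrapped into a single user turn, or a ready-made LanguagePrompt; when the chat template indexes into content (the "content[" probe), text and per-image placeholders are assembled automatically, and image_sizes is computed from the tensors up front rather than from processor output. A hedged usage sketch, assuming model is a loaded TransformersManagedModel and images is a BHWC RGBImageBatch (names are illustrative):

    # Plain-string path: the string becomes one user message.
    tokens = model.tokenize("Describe the contents of this image.", images)

    # Structured path: pass a LanguagePrompt directly.
    messages: LanguagePrompt = [
        {"role": "user", "content": {"type": "text", "text": "Describe the contents of this image."}},
    ]
    tokens = model.tokenize(messages, images)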
@@ -423,6 +423,7 @@ KNOWN_HUGGINGFACE_MODEL_REPOS: Final[Set[str]] = {
     'facebook/nllb-200-distilled-1.3B',
     'THUDM/chatglm3-6b',
     'roborovski/superprompt-v1',
+    'Qwen/Qwen2-VL-7B-Instruct',
 }
 
 KNOWN_UNET_MODELS: Final[KnownDownloadables] = KnownDownloadables([
@@ -7,9 +7,9 @@ import math
 import random
 import logging
 
-from PIL import Image, ImageOps, ImageSequence, ImageFile
+from PIL import Image, ImageOps, ImageSequence
 from PIL.PngImagePlugin import PngInfo
-from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub import snapshot_download
 from natsort import natsorted
 import numpy as np
 import safetensors.torch
@@ -948,3 +948,11 @@ def seed_for_block(seed):
         np.random.set_state(numpy_rng_state)
         if torch.cuda.is_available():
             torch.cuda.set_rng_state_all(cuda_rng_state)
+
+
+def pil2tensor(image: Image) -> torch.Tensor:
+    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
+
+
+def tensor2pil(t_image: torch.Tensor) -> Image:
+    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
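pil2tensor and tensor2pil move into comfy.utils so the transformers path and the SVG nodes can share one implementation (the vtracer module's private copies are deleted below). A round-trip sketch of the conventions they encode, [1, H, W, C] float tensors in [0, 1] on one side and 8-bit PIL images on the other:

    from PIL import Image
    import numpy as np
    import torch

    # PIL -> [1, H, W, C] float tensor in [0, 1] (what pil2tensor does).
    pil_image = Image.new("RGB", (64, 64), color=(255, 0, 0))
    tensor = torch.from_numpy(np.array(pil_image).astype(np.float32) / 255.0).unsqueeze(0)
    assert tensor.shape == (1, 64, 64, 3)

    # Tensor -> 8-bit PIL image (what tensor2pil does).
    restored = Image.fromarray(np.clip(255.0 * tensor.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
    assert restored.size == (64, 64)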
@@ -14,7 +14,7 @@ from openai.types.chat import ChatCompletionMessageParam
 from comfy.cli_args import args
 from comfy.component_model.tensor_types import RGBImageBatch
 from comfy.language.language_types import LanguageModel, ProcessorResult, GENERATION_KWARGS_TYPE, TOKENS_TYPE, \
-    TransformerStreamedProgress
+    TransformerStreamedProgress, LanguagePrompt
 from comfy.nodes.package_typing import CustomNode, InputTypes
 from comfy.utils import comfy_progress, ProgressBar, seed_for_block
 
@@ -120,7 +120,7 @@ class OpenAILanguageModelWrapper(LanguageModel):
 
         return full_response
 
-    def tokenize(self, prompt: str, images: RGBImageBatch, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         # OpenAI API doesn't require explicit tokenization, so we'll just return the prompt and images as is
         return {
             "inputs": [prompt],
@@ -7,6 +7,7 @@ import vtracer
 from PIL import Image
 
 from comfy.nodes.package_typing import CustomNode
+from comfy.utils import tensor2pil
 
 
 def RGB2RGBA(image: Image, mask: Image) -> Image:
@@ -14,14 +15,6 @@ def RGB2RGBA(image: Image, mask: Image) -> Image:
     return Image.merge('RGBA', (R, G, B, mask.convert('L')))
 
 
-def pil2tensor(image: Image) -> torch.Tensor:
-    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
-
-
-def tensor2pil(t_image: torch.Tensor) -> Image:
-    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
-
-
 class ImageToSVG(CustomNode):
     @classmethod
     def INPUT_TYPES(cls):
tests/inference/workflows/qwen2-vl-0.json (new file, 106 lines)
@@ -0,0 +1,106 @@
+{
+  "4": {
+    "inputs": {
+      "ckpt_name": [
+        "20",
+        0
+      ],
+      "subfolder": ""
+    },
+    "class_type": "TransformersLoader",
+    "_meta": {
+      "title": "TransformersLoader"
+    }
+  },
+  "5": {
+    "inputs": {
+      "value": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
+      "name": "",
+      "title": "Image to query",
+      "description": "",
+      "__required": true
+    },
+    "class_type": "ImageRequestParameter",
+    "_meta": {
+      "title": "ImageRequestParameter"
+    }
+  },
+  "7": {
+    "inputs": {
+      "prompt": "Describe the contents of this image.",
+      "chat_template": "default",
+      "model": [
+        "4",
+        0
+      ],
+      "images": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "OneShotInstructTokenize",
+    "_meta": {
+      "title": "OneShotInstructTokenize"
+    }
+  },
+  "9": {
+    "inputs": {
+      "max_new_tokens": 512,
+      "repetition_penalty": 0,
+      "seed": 2598326659,
+      "__tokens": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting.<|im_end|>",
+      "model": [
+        "4",
+        0
+      ],
+      "tokens": [
+        "7",
+        0
+      ]
+    },
+    "class_type": "TransformersGenerate",
+    "_meta": {
+      "title": "TransformersGenerate"
+    }
+  },
+  "11": {
+    "inputs": {
+      "value": [
+        "9",
+        0
+      ],
+      "filename_prefix": "ComfyUI",
+      "extension": ".txt",
+      "output": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting."
+    },
+    "class_type": "SaveString",
+    "_meta": {
+      "title": "SaveString"
+    }
+  },
+  "20": {
+    "inputs": {
+      "value": "Qwen/Qwen2-VL-7B-Instruct",
+      "name": "",
+      "title": "",
+      "description": "",
+      "__required": true
+    },
+    "class_type": "StringEnumRequestParameter",
+    "_meta": {
+      "title": "StringEnumRequestParameter"
+    }
+  },
+  "21": {
+    "inputs": {
+      "images": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "PreviewImage",
+    "_meta": {
+      "title": "Preview Image"
+    }
+  }
+}
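The graph wires StringEnumRequestParameter (node 20) into TransformersLoader (node 4), fetches the test image by URL (node 5), tokenizes prompt plus image with OneShotInstructTokenize (node 7), generates with TransformersGenerate (node 9), and saves the caption via SaveString (node 11); the __tokens and output fields record the expected Qwen2-VL caption. A sketch of submitting the workflow through ComfyUI's standard /prompt HTTP API, assuming a local server on the default port 8188:

    import json
    import urllib.request

    # Load the new test workflow and submit it; endpoint and port are the
    # ComfyUI defaults and may differ in your deployment.
    with open("tests/inference/workflows/qwen2-vl-0.json") as f:
        graph = json.load(f)

    request = urllib.request.Request(
        "http://127.0.0.1:8188/prompt",
        data=json.dumps({"prompt": graph}).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    print(urllib.request.urlopen(request).read().decode("utf-8"))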