diff --git a/comfy/language/language_types.py b/comfy/language/language_types.py
index 448a67d1c..7a9fe376e 100644
--- a/comfy/language/language_types.py
+++ b/comfy/language/language_types.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Union, Callable, List, Optional, Protocol, runtime_checkable
+from typing import Union, Callable, List, Optional, Protocol, runtime_checkable, Literal
 
 import numpy as np
 import torch
@@ -63,6 +63,27 @@ LLaVAProcessor = Callable[
 ]
 
 
+class LanguageMessage(TypedDict):
+    role: Literal["system", "user", "assistant"]
+    content: str | MessageContent
+
+
+class MessageContentImage(TypedDict):
+    url: NotRequired[str]
+
+
+class MessageContent(TypedDict):
+    type: Literal["text", "image", "video", "image_url"]
+    text: NotRequired[str]
+    image: NotRequired[str]
+    image_url: NotRequired[MessageContentImage]
+    min_pixels: NotRequired[int]
+    max_pixels: NotRequired[int]
+
+
+LanguagePrompt = list[LanguageMessage]
+
+
 @runtime_checkable
 class LanguageModel(Protocol):
     @staticmethod
@@ -78,7 +99,7 @@ class LanguageModel(Protocol):
                  **kwargs) -> str:
         ...
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         ...
 
     @property
diff --git a/comfy/language/transformers_model_management.py b/comfy/language/transformers_model_management.py
index 9ee4a53c5..f4db3800d 100644
--- a/comfy/language/transformers_model_management.py
+++ b/comfy/language/transformers_model_management.py
@@ -7,7 +7,7 @@ import operator
 import pathlib
 import warnings
 from functools import reduce
-from typing import Optional, Any, Callable, List
+from typing import Optional, Any, Callable
 
 import torch
 from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, AutoProcessor, AutoTokenizer, \
@@ -18,12 +18,13 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING
 
 from .chat_templates import KNOWN_CHAT_TEMPLATES
 from .language_types import ProcessorResult, TOKENS_TYPE, GENERATION_KWARGS_TYPE, TransformerStreamedProgress, \
-    LLaVAProcessor, LanguageModel
+    LLaVAProcessor, LanguageModel, LanguagePrompt
 from .. import model_management
+from ..component_model.tensor_types import RGBImageBatch
 from ..model_downloader import get_or_download_huggingface_repo
 from ..model_management import unet_offload_device, get_torch_device, unet_dtype, load_models_gpu
 from ..model_management_types import ModelManageable
-from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block
+from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block, tensor2pil
 
 
 class TransformersManagedModel(ModelManageable, LanguageModel):
@@ -54,8 +55,9 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if subfolder is not None and subfolder != "":
             hub_kwargs["subfolder"] = subfolder
         repo_id = ckpt_name
-        ckpt_name = get_or_download_huggingface_repo(ckpt_name)
         with comfy_tqdm():
+            ckpt_name = get_or_download_huggingface_repo(ckpt_name)
+
             from_pretrained_kwargs = {
                 "pretrained_model_name_or_path": ckpt_name,
                 "trust_remote_code": True,
@@ -323,7 +325,7 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if processor is not None and hasattr(processor, "image_processor") and hasattr(processor.image_processor, "do_rescale"):
             processor.image_processor.do_rescale = False
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         tokenizer = self.tokenizer
         assert tokenizer is not None
         assert hasattr(tokenizer, "decode")
@@ -335,32 +337,57 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if len(candidate_chat_templates) > 0:
             filename, chat_template = candidate_chat_templates[0]
             logging.debug(f"Selected chat template filename={filename} for {self.model.name_or_path}")
+        if isinstance(images, list):
+            images = torch.stack(images, dim=0)
+        if images is not None:
+            # record (width, height) for each image in the batch
+            image_sizes = [(image.shape[-2], image.shape[-3]) for image in images]
+        else:
+            image_sizes = []
+            images = []
+
         try:
             if hasattr(tokenizer, "apply_chat_template"):
-                # todo: this should come from node inputs
-                prompt = tokenizer.apply_chat_template([
-                    {"role": "user", "content": prompt},
-                ], chat_template=chat_template, add_generation_prompt=True, tokenize=False)
+                messages: LanguagePrompt
+                if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict):
+                    messages = prompt
+                elif chat_template is not None and "content[" in chat_template:
+                    messages = [
+                        {"role": "user",
+                         "content": [
+                                        {
+                                            "type": "text",
+                                            "text": prompt
+                                        }
+                                    ] + [
+                                        {"type": "image"} for _ in range(len(images))
+                                    ]
+
+                         }
+                    ]
+                else:
+                    messages = [
+                        {"role": "user", "content": prompt},
+                    ]
+                prompt = tokenizer.apply_chat_template(messages, chat_template=chat_template, add_generation_prompt=True, tokenize=False)
         except Exception as exc:
             logging.debug("Could not apply chat template", exc_info=exc)
 
-        if self.processor is None:
+        if self.processor is None and isinstance(prompt, str):
             batch_encoding = tokenizer(prompt, return_tensors="pt").to(device=self.load_device)
             return {**batch_encoding}
         else:
-            assert images is not None and len(images) > 0, "When using a multi-modal model, pass at least one, possibly empty, image"
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.load_device)
-            assert "<image>" in prompt.lower(), "You must specify a <image> token inside the prompt for it to be substituted correctly by a HuggingFace processor"
-            batch_feature: BatchFeature = self.processor([prompt], images=images.unbind(), padding=True, return_tensors="pt")
+            batch_feature: BatchFeature = self.processor(text=[prompt], images=images.unbind(), return_tensors="pt", padding=True)
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.offload_device)
             assert "input_ids" in batch_feature
             batch_feature.to(device=self.load_device, dtype=self.model_dtype())
 
             # noinspection PyTypeChecker
             return {
-                "image_sizes": [(images.shape[-1], image.shape[-2]) for image in images],
+                "image_sizes": image_sizes,
                 "images": batch_feature["pixel_values"],
                 "inputs": batch_feature["input_ids"],
                 **batch_feature
diff --git a/comfy/model_downloader.py b/comfy/model_downloader.py
index 5f78da20e..4a6d18dea 100644
--- a/comfy/model_downloader.py
+++ b/comfy/model_downloader.py
@@ -423,6 +423,7 @@ KNOWN_HUGGINGFACE_MODEL_REPOS: Final[Set[str]] = {
     'facebook/nllb-200-distilled-1.3B',
     'THUDM/chatglm3-6b',
     'roborovski/superprompt-v1',
+    'Qwen/Qwen2-VL-7B-Instruct',
 }
 
 KNOWN_UNET_MODELS: Final[KnownDownloadables] = KnownDownloadables([
diff --git a/comfy/nodes/base_nodes.py b/comfy/nodes/base_nodes.py
index 1813dab35..c61258d8a 100644
--- a/comfy/nodes/base_nodes.py
+++ b/comfy/nodes/base_nodes.py
@@ -7,9 +7,9 @@ import math
 import random
 import logging
 
-from PIL import Image, ImageOps, ImageSequence, ImageFile
+from PIL import Image, ImageOps, ImageSequence
 from PIL.PngImagePlugin import PngInfo
-from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub import snapshot_download
 from natsort import natsorted
 import numpy as np
 import safetensors.torch
diff --git a/comfy/utils.py b/comfy/utils.py
index d410c59f5..81827a2ee 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -948,3 +948,11 @@ def seed_for_block(seed):
         np.random.set_state(numpy_rng_state)
         if torch.cuda.is_available():
             torch.cuda.set_rng_state_all(cuda_rng_state)
+
+
+def pil2tensor(image: Image) -> torch.Tensor:
+    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
+
+
+def tensor2pil(t_image: torch.Tensor) -> Image:
+    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
diff --git a/comfy_extras/nodes/nodes_openai.py b/comfy_extras/nodes/nodes_openai.py
index 325808286..a9d79cd73 100644
--- a/comfy_extras/nodes/nodes_openai.py
+++ b/comfy_extras/nodes/nodes_openai.py
@@ -14,7 +14,7 @@ from openai.types.chat import ChatCompletionMessageParam
 from comfy.cli_args import args
 from comfy.component_model.tensor_types import RGBImageBatch
 from comfy.language.language_types import LanguageModel, ProcessorResult, GENERATION_KWARGS_TYPE, TOKENS_TYPE, \
-    TransformerStreamedProgress
+    TransformerStreamedProgress, LanguagePrompt
 from comfy.nodes.package_typing import CustomNode, InputTypes
 from comfy.utils import comfy_progress, ProgressBar, seed_for_block
 
@@ -120,7 +120,7 @@ class OpenAILanguageModelWrapper(LanguageModel):
 
         return full_response
 
-    def tokenize(self, prompt: str, images: RGBImageBatch, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         # OpenAI API doesn't require explicit tokenization, so we'll just return the prompt and images as is
         return {
             "inputs": [prompt],
diff --git a/comfy_extras/nodes/nodes_svg.py b/comfy_extras/nodes/nodes_svg.py
index 78e50e5dc..7099c7555 100644
--- a/comfy_extras/nodes/nodes_svg.py
+++ b/comfy_extras/nodes/nodes_svg.py
@@ -7,6 +7,7 @@ import vtracer
 from PIL import Image
 
 from comfy.nodes.package_typing import CustomNode
+from comfy.utils import tensor2pil
 
 
 def RGB2RGBA(image: Image, mask: Image) -> Image:
@@ -14,14 +15,6 @@ def RGB2RGBA(image: Image, mask: Image) -> Image:
     return Image.merge('RGBA', (R, G, B, mask.convert('L')))
 
 
-def pil2tensor(image: Image) -> torch.Tensor:
-    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
-
-
-def tensor2pil(t_image: torch.Tensor) -> Image:
-    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
-
-
 class ImageToSVG(CustomNode):
     @classmethod
     def INPUT_TYPES(cls):
diff --git a/tests/inference/workflows/qwen2-vl-0.json b/tests/inference/workflows/qwen2-vl-0.json
new file mode 100644
index 000000000..e3693da0d
--- /dev/null
+++ b/tests/inference/workflows/qwen2-vl-0.json
@@ -0,0 +1,106 @@
+{
+  "4": {
+    "inputs": {
+      "ckpt_name": [
+        "20",
+        0
+      ],
+      "subfolder": ""
+    },
+    "class_type": "TransformersLoader",
+    "_meta": {
+      "title": "TransformersLoader"
+    }
+  },
+  "5": {
+    "inputs": {
+      "value": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
+      "name": "",
+      "title": "Image to query",
+      "description": "",
+      "__required": true
+    },
+    "class_type": "ImageRequestParameter",
+    "_meta": {
+      "title": "ImageRequestParameter"
+    }
+  },
+  "7": {
+    "inputs": {
+      "prompt": "Describe the contents of this image.",
+      "chat_template": "default",
+      "model": [
+        "4",
+        0
+      ],
+      "images": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "OneShotInstructTokenize",
+    "_meta": {
+      "title": "OneShotInstructTokenize"
+    }
+  },
+  "9": {
+    "inputs": {
+      "max_new_tokens": 512,
+      "repetition_penalty": 0,
+      "seed": 2598326659,
+      "__tokens": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting.<|im_end|>",
+      "model": [
+        "4",
+        0
+      ],
+      "tokens": [
+        "7",
+        0
+      ]
+    },
+    "class_type": "TransformersGenerate",
+    "_meta": {
+      "title": "TransformersGenerate"
+    }
+  },
+  "11": {
+    "inputs": {
+      "value": [
+        "9",
+        0
+      ],
+      "filename_prefix": "ComfyUI",
+      "extension": ".txt",
+      "output": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting."
+    },
+    "class_type": "SaveString",
+    "_meta": {
+      "title": "SaveString"
+    }
+  },
+  "20": {
+    "inputs": {
+      "value": "Qwen/Qwen2-VL-7B-Instruct",
+      "name": "",
+      "title": "",
+      "description": "",
+      "__required": true
+    },
+    "class_type": "StringEnumRequestParameter",
+    "_meta": {
+      "title": "StringEnumRequestParameter"
+    }
+  },
+  "21": {
+    "inputs": {
+      "images": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "PreviewImage",
+    "_meta": {
+      "title": "Preview Image"
+    }
+  }
+}
\ No newline at end of file