Mirror of https://github.com/comfyanonymous/ComfyUI.git
Commit: 25e636fb65 ("Qwen2")
Parent: e8eab4dbc6
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Union, Callable, List, Optional, Protocol, runtime_checkable
+from typing import Union, Callable, List, Optional, Protocol, runtime_checkable, Literal
 
 import numpy as np
 import torch
@@ -63,6 +63,27 @@ LLaVAProcessor = Callable[
 ]
 
 
+class LanguageMessage(TypedDict):
+    role: Literal["system", "user", "assistant"]
+    content: str | MessageContent
+
+
+class MessageContentImage(TypedDict):
+    url: NotRequired[str]
+
+
+class MessageContent(TypedDict):
+    type: Literal["text", "image", "video", "image_url"]
+    text: NotRequired[str]
+    image: NotRequired[str]
+    image_url: NotRequired[MessageContentImage]
+    min_pixels: NotRequired[int]
+    max_pixels: NotRequired[int]
+
+
+LanguagePrompt = list[LanguageMessage]
+
+
 @runtime_checkable
 class LanguageModel(Protocol):
     @staticmethod
@@ -78,7 +99,7 @@ class LanguageModel(Protocol):
                  **kwargs) -> str:
         ...
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         ...
 
     @property
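For orientation, a minimal sketch (not part of the commit) of a LanguagePrompt built from these TypedDicts; the import path is the one used by the OpenAI wrapper changes later in this diff, and tokenize() now accepts either a plain string or such a message list.

    # Hypothetical usage sketch of the new message types; not included in this commit.
    from comfy.language.language_types import LanguagePrompt

    prompt: LanguagePrompt = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": {"type": "text", "text": "Describe the contents of this image."}},
    ]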
@@ -7,7 +7,7 @@ import operator
 import pathlib
 import warnings
 from functools import reduce
-from typing import Optional, Any, Callable, List
+from typing import Optional, Any, Callable
 
 import torch
 from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, AutoProcessor, AutoTokenizer, \
@@ -18,12 +18,13 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPIN
 
 from .chat_templates import KNOWN_CHAT_TEMPLATES
 from .language_types import ProcessorResult, TOKENS_TYPE, GENERATION_KWARGS_TYPE, TransformerStreamedProgress, \
-    LLaVAProcessor, LanguageModel
+    LLaVAProcessor, LanguageModel, LanguagePrompt
 from .. import model_management
+from ..component_model.tensor_types import RGBImageBatch
 from ..model_downloader import get_or_download_huggingface_repo
 from ..model_management import unet_offload_device, get_torch_device, unet_dtype, load_models_gpu
 from ..model_management_types import ModelManageable
-from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block
+from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block, tensor2pil
 
 
 class TransformersManagedModel(ModelManageable, LanguageModel):
@@ -54,8 +55,9 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if subfolder is not None and subfolder != "":
             hub_kwargs["subfolder"] = subfolder
         repo_id = ckpt_name
-        ckpt_name = get_or_download_huggingface_repo(ckpt_name)
+        with comfy_tqdm():
+            ckpt_name = get_or_download_huggingface_repo(ckpt_name)
 
         from_pretrained_kwargs = {
             "pretrained_model_name_or_path": ckpt_name,
             "trust_remote_code": True,
@@ -323,7 +325,7 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if processor is not None and hasattr(processor, "image_processor") and hasattr(processor.image_processor, "do_rescale"):
             processor.image_processor.do_rescale = False
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         tokenizer = self.tokenizer
         assert tokenizer is not None
         assert hasattr(tokenizer, "decode")
@@ -335,32 +337,57 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if len(candidate_chat_templates) > 0:
             filename, chat_template = candidate_chat_templates[0]
             logging.debug(f"Selected chat template filename={filename} for {self.model.name_or_path}")
-        if isinstance(images, list):
-            images = torch.stack(images, dim=0)
+        if images is not None:
+            # PIL it for the sake of simplicity
+            image_sizes = [(image.shape[-2], image.shape[-3]) for image in images]
+        else:
+            image_sizes = []
+            images = []
 
         try:
             if hasattr(tokenizer, "apply_chat_template"):
-                # todo: this should come from node inputs
-                prompt = tokenizer.apply_chat_template([
-                    {"role": "user", "content": prompt},
-                ], chat_template=chat_template, add_generation_prompt=True, tokenize=False)
+                messages: LanguagePrompt
+                if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict):
+                    messages = prompt
+                elif "content[" in chat_template:
+                    messages = [
+                        {"role": "user",
+                         "content": [
+                             {
+                                 "type": "text",
+                                 "text": prompt
+                             }
+                         ] + [
+                             {"type": "image"} for _ in range(len(images))
+                         ]
+                         }
+                    ]
+                else:
+                    messages = [
+                        {"role": "user", "content": prompt},
+                    ]
+                prompt = tokenizer.apply_chat_template(messages, chat_template=chat_template, add_generation_prompt=True, tokenize=False)
         except Exception as exc:
             logging.debug("Could not apply chat template", exc_info=exc)
 
-        if self.processor is None:
+        if self.processor is None and isinstance(prompt, str):
             batch_encoding = tokenizer(prompt, return_tensors="pt").to(device=self.load_device)
             return {**batch_encoding}
         else:
             assert images is not None and len(images) > 0, "When using a multi-modal model, pass at least one, possibly empty, image"
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.load_device)
 
-            assert "<image>" in prompt.lower(), "You must specify a <image> token inside the prompt for it to be substituted correctly by a HuggingFace processor"
-            batch_feature: BatchFeature = self.processor([prompt], images=images.unbind(), padding=True, return_tensors="pt")
+            batch_feature: BatchFeature = self.processor(text=[prompt], images=images.unbind(), return_tensors="pt", padding=True)
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.offload_device)
             assert "input_ids" in batch_feature
             batch_feature.to(device=self.load_device, dtype=self.model_dtype())
             # noinspection PyTypeChecker
             return {
-                "image_sizes": [(images.shape[-1], image.shape[-2]) for image in images],
+                "image_sizes": image_sizes,
                 "images": batch_feature["pixel_values"],
                 "inputs": batch_feature["input_ids"],
                 **batch_feature
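To make the new branching in tokenize() easier to follow, a self-contained sketch of the message-selection rule: a list of role/content dicts passes through unchanged, a chat template that indexes into message content (the '"content[" in chat_template' check) gets per-image {"type": "image"} placeholders, and anything else is wrapped in a single user message. The helper name and the sample template string below are invented for illustration only.

    # Illustrative stand-alone mirror of the selection rule above; the function name and
    # the sample template string are hypothetical, the logic follows the diff.
    def build_messages(prompt, chat_template: str, num_images: int) -> list:
        if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict):
            return prompt  # already a LanguagePrompt-style message list
        if "content[" in chat_template:
            return [{
                "role": "user",
                "content": [{"type": "text", "text": prompt}]
                           + [{"type": "image"} for _ in range(num_images)],
            }]
        return [{"role": "user", "content": prompt}]


    # A template that iterates over structured content triggers the image placeholders:
    print(build_messages("Describe this image.", "{{ message.content[0] }}", 1))
    # -> [{'role': 'user', 'content': [{'type': 'text', 'text': 'Describe this image.'}, {'type': 'image'}]}]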
@@ -423,6 +423,7 @@ KNOWN_HUGGINGFACE_MODEL_REPOS: Final[Set[str]] = {
     'facebook/nllb-200-distilled-1.3B',
     'THUDM/chatglm3-6b',
     'roborovski/superprompt-v1',
+    'Qwen/Qwen2-VL-7B-Instruct',
 }
 
 KNOWN_UNET_MODELS: Final[KnownDownloadables] = KnownDownloadables([
@@ -7,9 +7,9 @@ import math
 import random
 import logging
 
-from PIL import Image, ImageOps, ImageSequence, ImageFile
+from PIL import Image, ImageOps, ImageSequence
 from PIL.PngImagePlugin import PngInfo
-from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub import snapshot_download
 from natsort import natsorted
 import numpy as np
 import safetensors.torch
@@ -948,3 +948,11 @@ def seed_for_block(seed):
         np.random.set_state(numpy_rng_state)
         if torch.cuda.is_available():
             torch.cuda.set_rng_state_all(cuda_rng_state)
+
+
+def pil2tensor(image: Image) -> torch.Tensor:
+    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
+
+
+def tensor2pil(t_image: torch.Tensor) -> Image:
+    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
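Since the two helpers now live in the shared utils module (the SVG node change below imports tensor2pil from there), a short round-trip sketch, assuming the comfy.utils import path from this commit; ComfyUI image tensors are [batch, height, width, channel] floats in 0..1.

    from PIL import Image

    from comfy.utils import pil2tensor, tensor2pil  # import path assumed from this commit

    pil_image = Image.new("RGB", (64, 64), color=(255, 0, 0))
    tensor = pil2tensor(pil_image)   # shape [1, 64, 64, 3], float32 in 0..1
    restored = tensor2pil(tensor)    # back to a 64x64 RGB PIL image
    assert tuple(tensor.shape) == (1, 64, 64, 3)
    assert restored.size == (64, 64)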
@@ -14,7 +14,7 @@ from openai.types.chat import ChatCompletionMessageParam
 from comfy.cli_args import args
 from comfy.component_model.tensor_types import RGBImageBatch
 from comfy.language.language_types import LanguageModel, ProcessorResult, GENERATION_KWARGS_TYPE, TOKENS_TYPE, \
-    TransformerStreamedProgress
+    TransformerStreamedProgress, LanguagePrompt
 from comfy.nodes.package_typing import CustomNode, InputTypes
 from comfy.utils import comfy_progress, ProgressBar, seed_for_block
 
@@ -120,7 +120,7 @@ class OpenAILanguageModelWrapper(LanguageModel):
 
         return full_response
 
-    def tokenize(self, prompt: str, images: RGBImageBatch, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         # OpenAI API doesn't require explicit tokenization, so we'll just return the prompt and images as is
         return {
             "inputs": [prompt],
@@ -7,6 +7,7 @@ import vtracer
 from PIL import Image
 
 from comfy.nodes.package_typing import CustomNode
+from comfy.utils import tensor2pil
 
 
 def RGB2RGBA(image: Image, mask: Image) -> Image:
@@ -14,14 +15,6 @@ def RGB2RGBA(image: Image, mask: Image) -> Image:
     return Image.merge('RGBA', (R, G, B, mask.convert('L')))
 
 
-def pil2tensor(image: Image) -> torch.Tensor:
-    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
-
-
-def tensor2pil(t_image: torch.Tensor) -> Image:
-    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
-
-
 class ImageToSVG(CustomNode):
     @classmethod
     def INPUT_TYPES(cls):
tests/inference/workflows/qwen2-vl-0.json (new file, 106 lines)
@@ -0,0 +1,106 @@
+{
+  "4": {
+    "inputs": {
+      "ckpt_name": [
+        "20",
+        0
+      ],
+      "subfolder": ""
+    },
+    "class_type": "TransformersLoader",
+    "_meta": {
+      "title": "TransformersLoader"
+    }
+  },
+  "5": {
+    "inputs": {
+      "value": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
+      "name": "",
+      "title": "Image to query",
+      "description": "",
+      "__required": true
+    },
+    "class_type": "ImageRequestParameter",
+    "_meta": {
+      "title": "ImageRequestParameter"
+    }
+  },
+  "7": {
+    "inputs": {
+      "prompt": "Describe the contents of this image.",
+      "chat_template": "default",
+      "model": [
+        "4",
+        0
+      ],
+      "images": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "OneShotInstructTokenize",
+    "_meta": {
+      "title": "OneShotInstructTokenize"
+    }
+  },
+  "9": {
+    "inputs": {
+      "max_new_tokens": 512,
+      "repetition_penalty": 0,
+      "seed": 2598326659,
+      "__tokens": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting.<|im_end|>",
+      "model": [
+        "4",
+        0
+      ],
+      "tokens": [
+        "7",
+        0
+      ]
+    },
+    "class_type": "TransformersGenerate",
+    "_meta": {
+      "title": "TransformersGenerate"
+    }
+  },
+  "11": {
+    "inputs": {
+      "value": [
+        "9",
+        0
+      ],
+      "filename_prefix": "ComfyUI",
+      "extension": ".txt",
+      "output": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting."
+    },
+    "class_type": "SaveString",
+    "_meta": {
+      "title": "SaveString"
+    }
+  },
+  "20": {
+    "inputs": {
+      "value": "Qwen/Qwen2-VL-7B-Instruct",
+      "name": "",
+      "title": "",
+      "description": "",
+      "__required": true
+    },
+    "class_type": "StringEnumRequestParameter",
+    "_meta": {
+      "title": "StringEnumRequestParameter"
+    }
+  },
+  "21": {
+    "inputs": {
+      "images": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "PreviewImage",
+    "_meta": {
+      "title": "Preview Image"
+    }
+  }
+}
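For readers unfamiliar with the API-format workflows used by these inference tests, each ["<node id>", <output index>] pair inside an "inputs" map links to another node's output. A small sketch, standard library only, with the path taken from the file name above:

    import json

    with open("tests/inference/workflows/qwen2-vl-0.json") as f:
        workflow = json.load(f)

    for node_id, node in workflow.items():
        links = {name: value for name, value in node["inputs"].items() if isinstance(value, list)}
        print(node_id, node["class_type"], "<-", links)
    # e.g. node "7" (OneShotInstructTokenize) takes "model" from node "4" and "images" from node "5".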