Qwen2

commit 25e636fb65 (parent e8eab4dbc6)
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Union, Callable, List, Optional, Protocol, runtime_checkable
+from typing import Union, Callable, List, Optional, Protocol, runtime_checkable, Literal
 
 import numpy as np
 import torch
@@ -63,6 +63,27 @@ LLaVAProcessor = Callable[
 ]
 
 
+class LanguageMessage(TypedDict):
+    role: Literal["system", "user", "assistant"]
+    content: str | MessageContent
+
+
+class MessageContentImage(TypedDict):
+    url: NotRequired[str]
+
+
+class MessageContent(TypedDict):
+    type: Literal["text", "image", "video", "image_url"]
+    text: NotRequired[str]
+    image: NotRequired[str]
+    image_url: NotRequired[MessageContentImage]
+    min_pixels: NotRequired[int]
+    max_pixels: NotRequired[int]
+
+
+LanguagePrompt = list[LanguageMessage]
+
+
 @runtime_checkable
 class LanguageModel(Protocol):
     @staticmethod
@@ -78,7 +99,7 @@ class LanguageModel(Protocol):
                  **kwargs) -> str:
         ...
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         ...
 
     @property
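The added TypedDicts give multi-part chat prompts a checkable shape: a LanguagePrompt is a list of role/content messages, and each content may be a plain string or a single MessageContent part. A minimal sketch of a value that type-checks against the definitions above (the URL and pixel bounds are illustrative, not from the commit):

    # Illustrative LanguagePrompt value; the URL is a placeholder.
    example_prompt: LanguagePrompt = [
        {"role": "system", "content": "You are a concise captioning assistant."},
        {
            "role": "user",
            "content": {
                "type": "image_url",
                "image_url": {"url": "https://example.com/dog.jpg"},  # placeholder
                "min_pixels": 256 * 256,
                "max_pixels": 1280 * 1280,
            },
        },
        {"role": "user", "content": "Describe the contents of this image."},
    ]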
@@ -7,7 +7,7 @@ import operator
 import pathlib
 import warnings
 from functools import reduce
-from typing import Optional, Any, Callable, List
+from typing import Optional, Any, Callable
 
 import torch
 from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, AutoProcessor, AutoTokenizer, \
@@ -18,12 +18,13 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPIN
 
 from .chat_templates import KNOWN_CHAT_TEMPLATES
 from .language_types import ProcessorResult, TOKENS_TYPE, GENERATION_KWARGS_TYPE, TransformerStreamedProgress, \
-    LLaVAProcessor, LanguageModel
+    LLaVAProcessor, LanguageModel, LanguagePrompt
 from .. import model_management
+from ..component_model.tensor_types import RGBImageBatch
 from ..model_downloader import get_or_download_huggingface_repo
 from ..model_management import unet_offload_device, get_torch_device, unet_dtype, load_models_gpu
 from ..model_management_types import ModelManageable
-from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block
+from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block, tensor2pil
 
 
 class TransformersManagedModel(ModelManageable, LanguageModel):
@@ -54,8 +55,9 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if subfolder is not None and subfolder != "":
             hub_kwargs["subfolder"] = subfolder
         repo_id = ckpt_name
-        ckpt_name = get_or_download_huggingface_repo(ckpt_name)
         with comfy_tqdm():
+            ckpt_name = get_or_download_huggingface_repo(ckpt_name)
+
             from_pretrained_kwargs = {
                 "pretrained_model_name_or_path": ckpt_name,
                 "trust_remote_code": True,
@@ -323,7 +325,7 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if processor is not None and hasattr(processor, "image_processor") and hasattr(processor.image_processor, "do_rescale"):
             processor.image_processor.do_rescale = False
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         tokenizer = self.tokenizer
         assert tokenizer is not None
         assert hasattr(tokenizer, "decode")
@@ -335,32 +337,57 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if len(candidate_chat_templates) > 0:
             filename, chat_template = candidate_chat_templates[0]
             logging.debug(f"Selected chat template filename={filename} for {self.model.name_or_path}")
+        if isinstance(images, list):
+            images = torch.stack(images, dim=0)
+        if images is not None:
+            # PIL it for the sake of simplicity
+            image_sizes = [(image.shape[-2], image.shape[-3]) for image in images]
+        else:
+            image_sizes = []
+            images = []
+
         try:
             if hasattr(tokenizer, "apply_chat_template"):
-                # todo: this should come from node inputs
-                prompt = tokenizer.apply_chat_template([
-                    {"role": "user", "content": prompt},
-                ], chat_template=chat_template, add_generation_prompt=True, tokenize=False)
+                messages: LanguagePrompt
+                if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict):
+                    messages = prompt
+                elif "content[" in chat_template:
+                    messages = [
+                        {"role": "user",
+                         "content": [
+                             {
+                                 "type": "text",
+                                 "text": prompt
+                             }
+                         ] + [
+                             {"type": "image"} for _ in range(len(images))
+                         ]
+
+                         }
+                    ]
+                else:
+                    messages = [
+                        {"role": "user", "content": prompt},
+                    ]
+                prompt = tokenizer.apply_chat_template(messages, chat_template=chat_template, add_generation_prompt=True, tokenize=False)
         except Exception as exc:
             logging.debug("Could not apply chat template", exc_info=exc)
 
-        if self.processor is None:
+        if self.processor is None and isinstance(prompt, str):
             batch_encoding = tokenizer(prompt, return_tensors="pt").to(device=self.load_device)
             return {**batch_encoding}
         else:
-            assert images is not None and len(images) > 0, "When using a multi-modal model, pass at least one, possibly empty, image"
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.load_device)
 
-            assert "<image>" in prompt.lower(), "You must specify a <image> token inside the prompt for it to be substituted correctly by a HuggingFace processor"
-            batch_feature: BatchFeature = self.processor(text=[prompt], images=images.unbind(), return_tensors="pt", padding=True)
+            batch_feature: BatchFeature = self.processor([prompt], images=images.unbind(), padding=True, return_tensors="pt")
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.offload_device)
             assert "input_ids" in batch_feature
             batch_feature.to(device=self.load_device, dtype=self.model_dtype())
             # noinspection PyTypeChecker
             return {
-                "image_sizes": [(images.shape[-1], image.shape[-2]) for image in images],
+                "image_sizes": image_sizes,
                 "images": batch_feature["pixel_values"],
                 "inputs": batch_feature["input_ids"],
                 **batch_feature
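With this change, tokenize accepts either a plain string, which is wrapped into a single user turn, or a ready-made LanguagePrompt; when the chat template indexes into content (the "content[" probe), text and per-image placeholders are assembled automatically, and image_sizes is computed from the tensors up front rather than from processor output. A hedged usage sketch, assuming model is a loaded TransformersManagedModel and images is a BHWC RGBImageBatch (names are illustrative):

    # Plain-string path: the string becomes one user message.
    tokens = model.tokenize("Describe the contents of this image.", images)

    # Structured path: pass a LanguagePrompt directly.
    messages: LanguagePrompt = [
        {"role": "user", "content": {"type": "text", "text": "Describe the contents of this image."}},
    ]
    tokens = model.tokenize(messages, images)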
@@ -423,6 +423,7 @@ KNOWN_HUGGINGFACE_MODEL_REPOS: Final[Set[str]] = {
     'facebook/nllb-200-distilled-1.3B',
     'THUDM/chatglm3-6b',
     'roborovski/superprompt-v1',
+    'Qwen/Qwen2-VL-7B-Instruct',
 }
 
 KNOWN_UNET_MODELS: Final[KnownDownloadables] = KnownDownloadables([
@@ -7,9 +7,9 @@ import math
 import random
 import logging
 
-from PIL import Image, ImageOps, ImageSequence, ImageFile
+from PIL import Image, ImageOps, ImageSequence
 from PIL.PngImagePlugin import PngInfo
-from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub import snapshot_download
 from natsort import natsorted
 import numpy as np
 import safetensors.torch
@@ -948,3 +948,11 @@ def seed_for_block(seed):
         np.random.set_state(numpy_rng_state)
         if torch.cuda.is_available():
             torch.cuda.set_rng_state_all(cuda_rng_state)
+
+
+def pil2tensor(image: Image) -> torch.Tensor:
+    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
+
+
+def tensor2pil(t_image: torch.Tensor) -> Image:
+    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
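pil2tensor and tensor2pil move into comfy.utils so the transformers path and the SVG nodes can share one implementation (the vtracer module's private copies are deleted below). A round-trip sketch of the conventions they encode, [1, H, W, C] float tensors in [0, 1] on one side and 8-bit PIL images on the other:

    from PIL import Image
    import numpy as np
    import torch

    # PIL -> [1, H, W, C] float tensor in [0, 1] (what pil2tensor does).
    pil_image = Image.new("RGB", (64, 64), color=(255, 0, 0))
    tensor = torch.from_numpy(np.array(pil_image).astype(np.float32) / 255.0).unsqueeze(0)
    assert tensor.shape == (1, 64, 64, 3)

    # Tensor -> 8-bit PIL image (what tensor2pil does).
    restored = Image.fromarray(np.clip(255.0 * tensor.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
    assert restored.size == (64, 64)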
@@ -14,7 +14,7 @@ from openai.types.chat import ChatCompletionMessageParam
 from comfy.cli_args import args
 from comfy.component_model.tensor_types import RGBImageBatch
 from comfy.language.language_types import LanguageModel, ProcessorResult, GENERATION_KWARGS_TYPE, TOKENS_TYPE, \
-    TransformerStreamedProgress
+    TransformerStreamedProgress, LanguagePrompt
 from comfy.nodes.package_typing import CustomNode, InputTypes
 from comfy.utils import comfy_progress, ProgressBar, seed_for_block
 
@@ -120,7 +120,7 @@ class OpenAILanguageModelWrapper(LanguageModel):
 
         return full_response
 
-    def tokenize(self, prompt: str, images: RGBImageBatch, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         # OpenAI API doesn't require explicit tokenization, so we'll just return the prompt and images as is
         return {
             "inputs": [prompt],
@@ -7,6 +7,7 @@ import vtracer
 from PIL import Image
 
 from comfy.nodes.package_typing import CustomNode
+from comfy.utils import tensor2pil
 
 
 def RGB2RGBA(image: Image, mask: Image) -> Image:
@@ -14,14 +15,6 @@ def RGB2RGBA(image: Image, mask: Image) -> Image:
     return Image.merge('RGBA', (R, G, B, mask.convert('L')))
 
 
-def pil2tensor(image: Image) -> torch.Tensor:
-    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
-
-
-def tensor2pil(t_image: torch.Tensor) -> Image:
-    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
-
-
 class ImageToSVG(CustomNode):
     @classmethod
     def INPUT_TYPES(cls):
tests/inference/workflows/qwen2-vl-0.json (new file, 106 lines)
@@ -0,0 +1,106 @@
+{
+  "4": {
+    "inputs": {
+      "ckpt_name": [
+        "20",
+        0
+      ],
+      "subfolder": ""
+    },
+    "class_type": "TransformersLoader",
+    "_meta": {
+      "title": "TransformersLoader"
+    }
+  },
+  "5": {
+    "inputs": {
+      "value": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
+      "name": "",
+      "title": "Image to query",
+      "description": "",
+      "__required": true
+    },
+    "class_type": "ImageRequestParameter",
+    "_meta": {
+      "title": "ImageRequestParameter"
+    }
+  },
+  "7": {
+    "inputs": {
+      "prompt": "Describe the contents of this image.",
+      "chat_template": "default",
+      "model": [
+        "4",
+        0
+      ],
+      "images": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "OneShotInstructTokenize",
+    "_meta": {
+      "title": "OneShotInstructTokenize"
+    }
+  },
+  "9": {
+    "inputs": {
+      "max_new_tokens": 512,
+      "repetition_penalty": 0,
+      "seed": 2598326659,
+      "__tokens": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting.<|im_end|>",
+      "model": [
+        "4",
+        0
+      ],
+      "tokens": [
+        "7",
+        0
+      ]
+    },
+    "class_type": "TransformersGenerate",
+    "_meta": {
+      "title": "TransformersGenerate"
+    }
+  },
+  "11": {
+    "inputs": {
+      "value": [
+        "9",
+        0
+      ],
+      "filename_prefix": "ComfyUI",
+      "extension": ".txt",
+      "output": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting."
+    },
+    "class_type": "SaveString",
+    "_meta": {
+      "title": "SaveString"
+    }
+  },
+  "20": {
+    "inputs": {
+      "value": "Qwen/Qwen2-VL-7B-Instruct",
+      "name": "",
+      "title": "",
+      "description": "",
+      "__required": true
+    },
+    "class_type": "StringEnumRequestParameter",
+    "_meta": {
+      "title": "StringEnumRequestParameter"
+    }
+  },
+  "21": {
+    "inputs": {
+      "images": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "PreviewImage",
+    "_meta": {
+      "title": "Preview Image"
+    }
+  }
+}
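The graph wires StringEnumRequestParameter (node 20) into TransformersLoader (node 4), fetches the test image by URL (node 5), tokenizes prompt plus image with OneShotInstructTokenize (node 7), generates with TransformersGenerate (node 9), and saves the caption via SaveString (node 11); the __tokens and output fields record the expected Qwen2-VL caption. A sketch of submitting the workflow through ComfyUI's standard /prompt HTTP API, assuming a local server on the default port 8188:

    import json
    import urllib.request

    # Load the new test workflow and submit it; endpoint and port are the
    # ComfyUI defaults and may differ in your deployment.
    with open("tests/inference/workflows/qwen2-vl-0.json") as f:
        graph = json.load(f)

    request = urllib.request.Request(
        "http://127.0.0.1:8188/prompt",
        data=json.dumps({"prompt": graph}).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    print(urllib.request.urlopen(request).read().decode("utf-8"))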