doctorpangloss 2024-09-06 17:44:08 -07:00
parent e8eab4dbc6
commit 25e636fb65
8 changed files with 184 additions and 28 deletions

View File

@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Union, Callable, List, Optional, Protocol, runtime_checkable
+from typing import Union, Callable, List, Optional, Protocol, runtime_checkable, Literal
 
 import numpy as np
 import torch
@@ -63,6 +63,27 @@ LLaVAProcessor = Callable[
 ]
 
 
+class LanguageMessage(TypedDict):
+    role: Literal["system", "user", "assistant"]
+    content: str | MessageContent
+
+
+class MessageContentImage(TypedDict):
+    url: NotRequired[str]
+
+
+class MessageContent(TypedDict):
+    type: Literal["text", "image", "video", "image_url"]
+    text: NotRequired[str]
+    image: NotRequired[str]
+    image_url: NotRequired[MessageContentImage]
+    min_pixels: NotRequired[int]
+    max_pixels: NotRequired[int]
+
+
+LanguagePrompt = list[LanguageMessage]
+
+
 @runtime_checkable
 class LanguageModel(Protocol):
     @staticmethod
@@ -78,7 +99,7 @@ class LanguageModel(Protocol):
                  **kwargs) -> str:
         ...
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         ...
 
     @property

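For reference, a minimal sketch of a LanguagePrompt built from the types added above; the exact content parts a given model consumes depend on its chat template, and this particular message mix is an assumption, not taken from the commit:

from comfy.language.language_types import LanguagePrompt

# Hypothetical example of the new structured prompt shape.
prompt: LanguagePrompt = [
    {"role": "system", "content": "You are a concise captioner."},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe the contents of this image."},
            # A bare image part; the processor substitutes the actual pixels.
            {"type": "image"},
        ],
    },
]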
View File

@@ -7,7 +7,7 @@ import operator
 import pathlib
 import warnings
 from functools import reduce
-from typing import Optional, Any, Callable, List
+from typing import Optional, Any, Callable
 
 import torch
 from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, AutoProcessor, AutoTokenizer, \
@@ -18,12 +18,13 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPIN
 from .chat_templates import KNOWN_CHAT_TEMPLATES
 from .language_types import ProcessorResult, TOKENS_TYPE, GENERATION_KWARGS_TYPE, TransformerStreamedProgress, \
-    LLaVAProcessor, LanguageModel
+    LLaVAProcessor, LanguageModel, LanguagePrompt
 from .. import model_management
 from ..component_model.tensor_types import RGBImageBatch
 from ..model_downloader import get_or_download_huggingface_repo
 from ..model_management import unet_offload_device, get_torch_device, unet_dtype, load_models_gpu
 from ..model_management_types import ModelManageable
-from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block
+from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block, tensor2pil
 
 
 class TransformersManagedModel(ModelManageable, LanguageModel):
@@ -54,8 +55,9 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if subfolder is not None and subfolder != "":
             hub_kwargs["subfolder"] = subfolder
         repo_id = ckpt_name
-        ckpt_name = get_or_download_huggingface_repo(ckpt_name)
+        with comfy_tqdm():
+            ckpt_name = get_or_download_huggingface_repo(ckpt_name)
 
         from_pretrained_kwargs = {
             "pretrained_model_name_or_path": ckpt_name,
             "trust_remote_code": True,
@@ -323,7 +325,7 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if processor is not None and hasattr(processor, "image_processor") and hasattr(processor.image_processor, "do_rescale"):
             processor.image_processor.do_rescale = False
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         tokenizer = self.tokenizer
         assert tokenizer is not None
         assert hasattr(tokenizer, "decode")
@@ -335,32 +337,57 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if len(candidate_chat_templates) > 0:
             filename, chat_template = candidate_chat_templates[0]
             logging.debug(f"Selected chat template filename={filename} for {self.model.name_or_path}")
-        if isinstance(images, list):
-            images = torch.stack(images, dim=0)
+        if images is not None:
+            # PIL it for the sake of simplicity
+            image_sizes = [(image.shape[-2], image.shape[-3]) for image in images]
+        else:
+            image_sizes = []
+            images = []
         try:
             if hasattr(tokenizer, "apply_chat_template"):
-                # todo: this should come from node inputs
-                prompt = tokenizer.apply_chat_template([
-                    {"role": "user", "content": prompt},
-                ], chat_template=chat_template, add_generation_prompt=True, tokenize=False)
+                messages: LanguagePrompt
+                if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict):
+                    messages = prompt
+                elif "content[" in chat_template:
+                    messages = [
+                        {"role": "user",
+                         "content": [
+                             {
+                                 "type": "text",
+                                 "text": prompt
+                             }
+                         ] + [
+                             {"type": "image"} for _ in range(len(images))
+                         ]
+                         }
+                    ]
+                else:
+                    messages = [
+                        {"role": "user", "content": prompt},
+                    ]
+                prompt = tokenizer.apply_chat_template(messages, chat_template=chat_template, add_generation_prompt=True, tokenize=False)
         except Exception as exc:
             logging.debug("Could not apply chat template", exc_info=exc)
 
-        if self.processor is None:
+        if self.processor is None and isinstance(prompt, str):
             batch_encoding = tokenizer(prompt, return_tensors="pt").to(device=self.load_device)
             return {**batch_encoding}
         else:
             assert images is not None and len(images) > 0, "When using a multi-modal model, pass at least one, possibly empty, image"
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.load_device)
-            assert "<image>" in prompt.lower(), "You must specify a <image> token inside the prompt for it to be substituted correctly by a HuggingFace processor"
-            batch_feature: BatchFeature = self.processor([prompt], images=images.unbind(), padding=True, return_tensors="pt")
+            batch_feature: BatchFeature = self.processor(text=[prompt], images=images.unbind(), return_tensors="pt", padding=True)
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.offload_device)
             assert "input_ids" in batch_feature
             batch_feature.to(device=self.load_device, dtype=self.model_dtype())
             # noinspection PyTypeChecker
             return {
-                "image_sizes": [(images.shape[-1], image.shape[-2]) for image in images],
+                "image_sizes": image_sizes,
                 "images": batch_feature["pixel_values"],
                 "inputs": batch_feature["input_ids"],
                 **batch_feature

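A sketch of how the reworked tokenize() can be called, assuming `model` is a loaded TransformersManagedModel and `images` is an RGBImageBatch tensor (shape [B, H, W, C], floats in [0, 1] per the usual ComfyUI convention):

# Plain-string prompt: wrapped into a single user message before
# apply_chat_template runs.
tokens = model.tokenize("Describe the contents of this image.", images)

# Structured prompt: passed through unchanged by the
# isinstance(prompt, list) branch above.
messages = [
    {"role": "user", "content": [
        {"type": "text", "text": "Describe the contents of this image."},
        {"type": "image"},
    ]},
]
tokens = model.tokenize(messages, images)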
View File

@@ -423,6 +423,7 @@ KNOWN_HUGGINGFACE_MODEL_REPOS: Final[Set[str]] = {
     'facebook/nllb-200-distilled-1.3B',
     'THUDM/chatglm3-6b',
     'roborovski/superprompt-v1',
+    'Qwen/Qwen2-VL-7B-Instruct',
 }
 
 KNOWN_UNET_MODELS: Final[KnownDownloadables] = KnownDownloadables([

View File

@@ -7,9 +7,9 @@ import math
 import random
 import logging
 
-from PIL import Image, ImageOps, ImageSequence, ImageFile
+from PIL import Image, ImageOps, ImageSequence
 from PIL.PngImagePlugin import PngInfo
-from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub import snapshot_download
 from natsort import natsorted
 import numpy as np
 import safetensors.torch

View File

@@ -948,3 +948,11 @@ def seed_for_block(seed):
         np.random.set_state(numpy_rng_state)
         if torch.cuda.is_available():
             torch.cuda.set_rng_state_all(cuda_rng_state)
+
+
+def pil2tensor(image: Image) -> torch.Tensor:
+    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
+
+
+def tensor2pil(t_image: torch.Tensor) -> Image:
+    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
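These helpers round-trip between PIL images and ComfyUI's IMAGE tensor convention ([B, H, W, C] float32 in [0, 1]); note that tensor2pil squeezes the batch dimension, so it assumes a batch of one. A quick sketch:

from PIL import Image

pil = Image.new("RGB", (64, 64), "red")
t = pil2tensor(pil)    # torch.Size([1, 64, 64, 3]), values in [0, 1]
back = tensor2pil(t)   # squeeze() drops the singleton batch dim
assert back.size == pil.size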

View File

@@ -14,7 +14,7 @@ from openai.types.chat import ChatCompletionMessageParam
 from comfy.cli_args import args
 from comfy.component_model.tensor_types import RGBImageBatch
 from comfy.language.language_types import LanguageModel, ProcessorResult, GENERATION_KWARGS_TYPE, TOKENS_TYPE, \
-    TransformerStreamedProgress
+    TransformerStreamedProgress, LanguagePrompt
 from comfy.nodes.package_typing import CustomNode, InputTypes
 from comfy.utils import comfy_progress, ProgressBar, seed_for_block
@@ -120,7 +120,7 @@ class OpenAILanguageModelWrapper(LanguageModel):
         return full_response
 
-    def tokenize(self, prompt: str, images: RGBImageBatch, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         # OpenAI API doesn't require explicit tokenization, so we'll just return the prompt and images as is
         return {
             "inputs": [prompt],

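Because the OpenAI wrapper defers real tokenization to the API, a structured LanguagePrompt already has the shape chat-completion messages expect. A sketch of that correspondence (not the wrapper's actual generate code; the model name and key handling here are assumptions):

from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
response = client.chat.completions.create(
    model="gpt-4o-mini",  # hypothetical model choice
    messages=[{"role": "user", "content": "Describe the contents of this image."}],
)
print(response.choices[0].message.content)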
View File

@@ -7,6 +7,7 @@ import vtracer
 from PIL import Image
 
 from comfy.nodes.package_typing import CustomNode
+from comfy.utils import tensor2pil
 
 
 def RGB2RGBA(image: Image, mask: Image) -> Image:
@@ -14,14 +15,6 @@ def RGB2RGBA(image: Image, mask: Image) -> Image:
     return Image.merge('RGBA', (R, G, B, mask.convert('L')))
 
 
-def pil2tensor(image: Image) -> torch.Tensor:
-    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
-
-
-def tensor2pil(t_image: torch.Tensor) -> Image:
-    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
-
-
 class ImageToSVG(CustomNode):
     @classmethod
     def INPUT_TYPES(cls):

View File

@@ -0,0 +1,106 @@
{
"4": {
"inputs": {
"ckpt_name": [
"20",
0
],
"subfolder": ""
},
"class_type": "TransformersLoader",
"_meta": {
"title": "TransformersLoader"
}
},
"5": {
"inputs": {
"value": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
"name": "",
"title": "Image to query",
"description": "",
"__required": true
},
"class_type": "ImageRequestParameter",
"_meta": {
"title": "ImageRequestParameter"
}
},
"7": {
"inputs": {
"prompt": "Describe the contents of this image.",
"chat_template": "default",
"model": [
"4",
0
],
"images": [
"5",
0
]
},
"class_type": "OneShotInstructTokenize",
"_meta": {
"title": "OneShotInstructTokenize"
}
},
"9": {
"inputs": {
"max_new_tokens": 512,
"repetition_penalty": 0,
"seed": 2598326659,
"__tokens": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting.<|im_end|>",
"model": [
"4",
0
],
"tokens": [
"7",
0
]
},
"class_type": "TransformersGenerate",
"_meta": {
"title": "TransformersGenerate"
}
},
"11": {
"inputs": {
"value": [
"9",
0
],
"filename_prefix": "ComfyUI",
"extension": ".txt",
"output": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting."
},
"class_type": "SaveString",
"_meta": {
"title": "SaveString"
}
},
"20": {
"inputs": {
"value": "Qwen/Qwen2-VL-7B-Instruct",
"name": "",
"title": "",
"description": "",
"__required": true
},
"class_type": "StringEnumRequestParameter",
"_meta": {
"title": "StringEnumRequestParameter"
}
},
"21": {
"inputs": {
"images": [
"5",
0
]
},
"class_type": "PreviewImage",
"_meta": {
"title": "Preview Image"
}
}
}
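This new workflow file is in API format, so it can be queued directly against a running server. A hedged sketch, assuming the default listen address and a hypothetical local filename for the JSON above:

import json
import urllib.request

with open("qwen2_vl_describe.json") as f:  # hypothetical filename
    workflow = json.load(f)

req = urllib.request.Request(
    "http://127.0.0.1:8188/prompt",  # default ComfyUI address assumed
    data=json.dumps({"prompt": workflow}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
print(urllib.request.urlopen(req).read().decode("utf-8"))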