Mirror of https://github.com/comfyanonymous/ComfyUI.git (synced 2026-01-10 14:20:49 +08:00)

Commit 8e282aea6d: improve images and videos support
Parent: 4349fac71a
@@ -196,6 +196,7 @@ def _create_parser() -> EnhancedConfigArgParser:
     parser.add_argument("--otel-exporter-otlp-endpoint", type=str, default=None, env_var="OTEL_EXPORTER_OTLP_ENDPOINT", help="A base endpoint URL for any signal type, with an optionally-specified port number. Helpful for when you're sending more than one signal to the same endpoint and want one environment variable to control the endpoint.")
     parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")
     parser.add_argument("--force-hf-local-dir-mode", action="store_true", help="Download repos from huggingface.co to the models/huggingface directory with the \"local_dir\" argument instead of models/huggingface_cache with the \"cache_dir\" argument, recreating the traditional file structure.")
+    parser.add_argument("--enable-video-to-image-fallback", action="store_true", help="Enable fallback to convert video frames to images for models that do not natively support video inputs.")

     parser.add_argument(
         "--front-end-version",
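Side note (not part of the diff): both new options are store_true flags, so they default to False and only flip to True when passed explicitly. A minimal sketch with plain argparse, which EnhancedConfigArgParser appears to extend, reusing the flag name from the diff:

import argparse

parser = argparse.ArgumentParser()
# Same flag name as in the diff; plain argparse stands in for EnhancedConfigArgParser here
parser.add_argument("--enable-video-to-image-fallback", action="store_true",
                    help="Enable fallback to convert video frames to images.")

print(parser.parse_args([]).enable_video_to_image_fallback)  # False (default)
print(parser.parse_args(["--enable-video-to-image-fallback"]).enable_video_to_image_fallback)  # True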
@@ -298,6 +299,7 @@ def _create_parser() -> EnhancedConfigArgParser:
     except Exception as exc:
         logger.error("Failed to load custom config plugin", exc_info=exc)

+    parser.add_argument("--disable-requests-caching", action="store_true", help="Disable requests caching (useful for testing)")
     return parser


@@ -250,6 +250,7 @@ class Configuration(dict):
         self.external_address: Optional[str] = None
         self.disable_known_models: bool = False
         self.max_queue_size: int = 65536
+        self.disable_requests_caching: bool = False
         self.force_channels_last: bool = False
         self.force_hf_local_dir_mode = False
         self.preview_size: int = 512
@@ -290,6 +291,7 @@ class Configuration(dict):
         self.default_device: Optional[int] = None
         self.block_runtime_package_installation = None
         self.enable_eval: Optional[bool] = False
+        self.enable_video_to_image_fallback: bool = False

         for key, value in kwargs.items():
             self[key] = value
@@ -30,6 +30,7 @@ from ..model_downloader import get_or_download_huggingface_repo
 from ..model_management import unet_offload_device, get_torch_device, unet_dtype, load_models_gpu
 from ..model_management_types import ModelManageableStub
 from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block
+from ..cli_args import args

 logger = logging.getLogger(__name__)

@@ -519,6 +520,20 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
         except Exception as exc:
             logger.debug("Could not apply chat template", exc_info=exc)

+        if isinstance(prompt, list):
+            # Fallback: extract text from messages if chat template application failed or wasn't available
+            extracted_text = []
+            for message in prompt:
+                if isinstance(message, dict) and "content" in message:
+                    content = message["content"]
+                    if isinstance(content, str):
+                        extracted_text.append(content)
+                    elif isinstance(content, list):
+                        for item in content:
+                            if isinstance(item, dict) and item.get("type") == "text":
+                                extracted_text.append(item.get("text", ""))
+            prompt = "\n".join(extracted_text)
+
         if self.processor is None and isinstance(prompt, str):
             batch_encoding = tokenizer(prompt, return_tensors="pt").to(device=self.load_device)
             return {**batch_encoding}
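For orientation (not part of the diff): the new fallback walks a chat-style message list and keeps only the text parts. A minimal sketch of the same logic on a hypothetical prompt, assuming the list-of-messages shape used by transformers chat templates (dicts with "role" and "content", where "content" is a string or a list of typed parts):

# Hypothetical chat-style prompt; the role/content structure is an assumption for illustration
prompt = [
    {"role": "user", "content": [
        {"type": "text", "text": "Describe what you see."},
        {"type": "video"},
    ]},
    {"role": "assistant", "content": "Sure."},
]

extracted_text = []
for message in prompt:
    if isinstance(message, dict) and "content" in message:
        content = message["content"]
        if isinstance(content, str):
            extracted_text.append(content)
        elif isinstance(content, list):
            for item in content:
                if isinstance(item, dict) and item.get("type") == "text":
                    extracted_text.append(item.get("text", ""))

print("\n".join(extracted_text))  # "Describe what you see." then "Sure." on the next line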
@@ -527,15 +542,58 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
         self.processor.to(device=self.load_device)
         # convert tuple to list from images.unbind() for paligemma workaround
         image_tensor_list = list(images.unbind()) if images is not None and len(images) > 0 else None
+
+        # Convert videos to list of list of frames (uint8)
+        if videos is not None and len(videos) > 0:
+            new_videos = []
+            for v in videos:
+                # Convert to uint8 0-255 if float
+                if v.dtype == torch.float32 or v.dtype == torch.float16 or v.dtype == torch.bfloat16:
+                    v = (v * 255).to(torch.uint8)
+                # Convert (T, H, W, C) tensor to list of (H, W, C) tensors
+                if v.ndim == 4:
+                    new_videos.append(list(v))
+                else:
+                    new_videos.append([v])  # Fallback if not 4D
+            videos = new_videos
+
+        # Check if processor accepts 'videos' argument
+        import inspect
+        processor_params = inspect.signature(self.processor).parameters
+        has_videos_arg = "videos" in processor_params
+
+        kwargs = {
+            "text": [prompt],
+            "images": image_tensor_list,
+            "return_tensors": "pt",
+            "padding": True,
+        }
+
+        if has_videos_arg:
+            kwargs["videos"] = videos
+            if "input_data_format" in processor_params:
+                kwargs["input_data_format"] = "channels_last"
+        elif videos is not None and len(videos) > 0:
+            if args.enable_video_to_image_fallback:
+                # Fallback: flatten video frames into images if processor doesn't support 'videos'
+                # videos is List[List[Frame]] where Frame is (H, W, C)
+                flattened_frames = []
+                for video in videos:
+                    flattened_frames.extend(video)
+
+                # Convert list of frames to list of tensors if needed, or just append to images list
+                # images is currently a list of tensors
+                if kwargs["images"] is None:
+                    kwargs["images"] = []
+
+                # Ensure frames are in the same format as images (tensors)
+                # Frames in videos are already tensors (uint8)
+                kwargs["images"].extend(flattened_frames)
+            else:
+                logger.warning(f"Model {self.model.name_or_path} does not support video inputs and video-to-image fallback is disabled. Use --enable-video-to-image-fallback to enable it.")
+
         try:
-            batch_feature: BatchFeature = self.processor(
-                text=[prompt],
-                images=image_tensor_list,
-                videos=None if videos is not None and len(videos) == 0 or (hasattr(videos, "shape") and videos.shape[0]) == 0 else videos,
-                return_tensors="pt",
-                padding=True,
-                input_data_format="channels_last"  # Ensure this is set for Qwen
-            )
+            batch_feature: BatchFeature = self.processor(**kwargs)
         except TypeError as exc_info:
             logger.warning(f"Exception while trying to run processor. Your transformers package is version {transformers.__version__} and may need to be updated")
             raise exc_info
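As an aside (not part of the diff): the branch above hinges on two things, inspecting the processor's call signature for a 'videos' parameter and normalizing float video tensors into lists of uint8 frames. A minimal standalone sketch of both ideas, with a hypothetical DummyProcessor standing in for a real transformers processor:

import inspect

import torch


class DummyProcessor:
    # Hypothetical stand-in; real transformers processors are callables with similar keyword parameters
    def __call__(self, text=None, images=None, videos=None, return_tensors="pt", padding=True):
        return {"n_images": 0 if images is None else len(images),
                "n_videos": 0 if videos is None else len(videos)}


processor = DummyProcessor()
processor_params = inspect.signature(processor).parameters  # resolves to the __call__ signature
has_videos_arg = "videos" in processor_params                # True for this dummy

# A float (T, H, W, C) video tensor becomes a list of uint8 (H, W, C) frames, as in the diff
video = torch.rand(4, 8, 8, 3)
frames = list((video * 255).to(torch.uint8))

print(has_videos_arg, len(frames), frames[0].shape)  # True 4 torch.Size([8, 8, 3])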
@@ -4,6 +4,7 @@ import pathlib
 import requests_cache
 from contextlib import contextmanager

+from .cli_args import args

 @contextmanager
 def use_requests_caching(
@@ -35,5 +36,9 @@ def use_requests_caching(
     kwargs.setdefault('use_cache_dir', not path_provided)
     kwargs.setdefault('cache_control', cache_control)

+    if args.disable_requests_caching:
+        yield
+        return
+
     with requests_cache.enabled(cache_name, **kwargs):
         yield
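For reference (not part of the diff): yielding early keeps the context manager usable whether or not caching is active, so call sites never need to branch on the flag themselves. A minimal sketch of the same shape, with names chosen purely for illustration:

from contextlib import contextmanager


@contextmanager
def maybe_cached(disabled: bool):
    # Hypothetical illustration of the early-yield pattern used in use_requests_caching
    if disabled:
        yield  # no cache set up; the with-block still runs
        return
    # ... set up the cache here before yielding ...
    yield


with maybe_cached(disabled=True):
    pass  # requests issued in this block would run uncached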
tests/inference/test_mixed_media_generic.py (new file, 47 lines)
@@ -0,0 +1,47 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestMixedMediaGeneric:
+    @pytest.mark.asyncio
+    async def test_mixed_media_generic(self):
+        graph = GraphBuilder()
+
+        # Load BLIP (small, standard model, image-only processor)
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Salesforce/blip-image-captioning-base")
+
+        # Load video (Goat)
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        # Use frame cap to keep it light
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Load image (Worm)
+        image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/Earthworm.jpg/330px-Earthworm.jpg"
+        load_image = graph.node("LoadImageFromURL", value=image_url)
+
+        # Tokenize with both video and image
+        # BLIP expects "images" (list of tensors) if we use the processor correctly.
+        # The fallback logic should convert video frames to images.
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="a photography of", videos=load_video.out(0), images=load_image.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=100, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="mixed_media_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        from comfy.cli_args import default_configuration
+        config = default_configuration()
+        config.enable_video_to_image_fallback = True
+
+        async with Comfy(configuration=config) as client:
+            outputs = await client.queue_prompt(prompt)
+
+        assert len(outputs) > 0
tests/inference/test_qwen3vl_mixed_media.py (new file, 45 lines)
@@ -0,0 +1,45 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestQwen3VLMixedMedia:
+    @pytest.mark.asyncio
+    async def test_qwen3vl_mixed_media(self):
+        graph = GraphBuilder()
+
+        # Load Qwen3-VL-2B-Instruct
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Qwen/Qwen3-VL-2B-Instruct", trust_remote_code=True)
+
+        # Load video (Goat)
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        # Use frame cap to keep it light
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Load image (Worm)
+        image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/Earthworm.jpg/330px-Earthworm.jpg"
+        load_image = graph.node("LoadImageFromURL", value=image_url)
+
+        # Tokenize with both video and image
+        # Qwen3-VL likely supports 'videos' input natively like Qwen2-VL
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="Describe what you see in the video and the image.", videos=load_video.out(0), images=load_image.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=100, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="qwen3vl_mixed_media_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        from comfy.cli_args_types import Configuration
+        config = Configuration()
+        config.disable_requests_caching = True
+        async with Comfy(configuration=config) as client:
+            outputs = await client.queue_prompt(prompt)
+
+        assert len(outputs) > 0
tests/inference/test_qwenvl_mixed_media.py (new file, 41 lines)
@@ -0,0 +1,41 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestQwenVLMixedMedia:
+    @pytest.mark.asyncio
+    async def test_qwenvl_mixed_media(self):
+        graph = GraphBuilder()
+
+        # Load Qwen2-VL-2B-Instruct
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Qwen/Qwen2-VL-2B-Instruct")
+
+        # Load video (Goat)
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        # Use frame cap to keep it light
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Load image (Worm)
+        image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/Earthworm.jpg/330px-Earthworm.jpg"
+        load_image = graph.node("LoadImageFromURL", value=image_url)
+
+        # Tokenize with both video and image
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="Describe what you see in the video and the image.", videos=load_video.out(0), images=load_image.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=100, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="qwenvl_mixed_media_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        async with Comfy() as client:
+            outputs = await client.queue_prompt(prompt)
+
+        assert len(outputs) > 0
tests/inference/test_qwenvl_video.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestQwenVLVideo:
+    @pytest.mark.asyncio
+    async def test_qwenvl_video_loading(self):
+        graph = GraphBuilder()
+
+        # Load a small QwenVL model
+        # Qwen/Qwen2-VL-2B-Instruct is a good candidate for a "small" QwenVL model
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Qwen/Qwen2-VL-2B-Instruct")
+
+        # Load video from URL with frame cap to avoid OOM
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Tokenize with video
+        # OneShotInstructTokenize has optional 'videos' input
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="Describe this video.", videos=load_video.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=50, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="qwenvl_video_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        async with Comfy() as client:
+            outputs = await client.queue_prompt(prompt)
+
+        # If an unsupported model fails earlier, this is never reached; otherwise verify output was produced
+        assert len(outputs) > 0