From 8e282aea6d8844d8ae729cab0e7da1112c5d292a Mon Sep 17 00:00:00 2001
From: doctorpangloss <2229300+doctorpangloss@users.noreply.github.com>
Date: Wed, 3 Dec 2025 15:28:56 -0800
Subject: [PATCH] improve image and video support

---
 comfy/cli_args.py                             |  2 +
 comfy/cli_args_types.py                       |  2 +
 .../language/transformers_model_management.py | 74 +++++++++++++++++--
 comfy/node_requests_caching.py                |  5 ++
 tests/inference/test_mixed_media_generic.py   | 47 ++++++++++++
 tests/inference/test_qwen3vl_mixed_media.py   | 45 +++++++++++
 tests/inference/test_qwenvl_mixed_media.py    | 41 ++++++++++
 tests/inference/test_qwenvl_video.py          | 39 ++++++++++
 8 files changed, 247 insertions(+), 8 deletions(-)
 create mode 100644 tests/inference/test_mixed_media_generic.py
 create mode 100644 tests/inference/test_qwen3vl_mixed_media.py
 create mode 100644 tests/inference/test_qwenvl_mixed_media.py
 create mode 100644 tests/inference/test_qwenvl_video.py

diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index 63157dd24..d8857597b 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -196,6 +196,7 @@ def _create_parser() -> EnhancedConfigArgParser:
     parser.add_argument("--otel-exporter-otlp-endpoint", type=str, default=None, env_var="OTEL_EXPORTER_OTLP_ENDPOINT", help="A base endpoint URL for any signal type, with an optionally-specified port number. Helpful for when you're sending more than one signal to the same endpoint and want one environment variable to control the endpoint.")
     parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")
     parser.add_argument("--force-hf-local-dir-mode", action="store_true", help="Download repos from huggingface.co to the models/huggingface directory with the \"local_dir\" argument instead of models/huggingface_cache with the \"cache_dir\" argument, recreating the traditional file structure.")
+    parser.add_argument("--enable-video-to-image-fallback", action="store_true", help="Enable fallback to convert video frames to images for models that do not natively support video inputs.")
 
     parser.add_argument(
         "--front-end-version",
@@ -298,6 +299,7 @@ def _create_parser() -> EnhancedConfigArgParser:
         except Exception as exc:
             logger.error("Failed to load custom config plugin", exc_info=exc)
 
+    parser.add_argument("--disable-requests-caching", action="store_true", help="Disable requests caching (useful for testing)")
     return parser
 
 
diff --git a/comfy/cli_args_types.py b/comfy/cli_args_types.py
index 903f46f1a..ee377936f 100644
--- a/comfy/cli_args_types.py
+++ b/comfy/cli_args_types.py
@@ -250,6 +250,7 @@ class Configuration(dict):
         self.external_address: Optional[str] = None
         self.disable_known_models: bool = False
         self.max_queue_size: int = 65536
+        self.disable_requests_caching: bool = False
         self.force_channels_last: bool = False
         self.force_hf_local_dir_mode = False
         self.preview_size: int = 512
@@ -290,6 +291,7 @@ class Configuration(dict):
         self.default_device: Optional[int] = None
         self.block_runtime_package_installation = None
         self.enable_eval: Optional[bool] = False
+        self.enable_video_to_image_fallback: bool = False
         for key, value in kwargs.items():
             self[key] = value
 
diff --git a/comfy/language/transformers_model_management.py b/comfy/language/transformers_model_management.py
index 83d703d5e..66f29f9bc 100644
--- a/comfy/language/transformers_model_management.py
+++ b/comfy/language/transformers_model_management.py
@@ -30,6 +30,7 @@ from ..model_downloader import get_or_download_huggingface_repo
 from ..model_management import unet_offload_device, get_torch_device, unet_dtype, load_models_gpu
 from ..model_management_types import ModelManageableStub
 from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block
+from ..cli_args import args
 
 logger = logging.getLogger(__name__)
 
@@ -519,6 +520,20 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
             except Exception as exc:
                 logger.debug("Could not apply chat template", exc_info=exc)
 
+        if isinstance(prompt, list):
+            # Fallback: extract text from messages if chat template application failed or wasn't available
+            extracted_text = []
+            for message in prompt:
+                if isinstance(message, dict) and "content" in message:
+                    content = message["content"]
+                    if isinstance(content, str):
+                        extracted_text.append(content)
+                    elif isinstance(content, list):
+                        for item in content:
+                            if isinstance(item, dict) and item.get("type") == "text":
+                                extracted_text.append(item.get("text", ""))
+            prompt = "\n".join(extracted_text)
+
         if self.processor is None and isinstance(prompt, str):
             batch_encoding = tokenizer(prompt, return_tensors="pt").to(device=self.load_device)
             return {**batch_encoding}
@@ -527,15 +542,58 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
         self.processor.to(device=self.load_device)
         # convert tuple to list from images.unbind() for paligemma workaround
         image_tensor_list = list(images.unbind()) if images is not None and len(images) > 0 else None
+
+        # Convert videos to a list of lists of frames (uint8)
+        if videos is not None and len(videos) > 0:
+            new_videos = []
+            for v in videos:
+                # Convert to uint8 0-255 if float
+                if v.dtype == torch.float32 or v.dtype == torch.float16 or v.dtype == torch.bfloat16:
+                    v = (v * 255).to(torch.uint8)
+                # Convert (T, H, W, C) tensor to a list of (H, W, C) tensors
+                if v.ndim == 4:
+                    new_videos.append(list(v))
+                else:
+                    new_videos.append([v])  # Fallback if not 4D
+            videos = new_videos
+
+        # Check if the processor accepts a 'videos' argument
+        import inspect
+        processor_params = inspect.signature(self.processor).parameters
+        has_videos_arg = "videos" in processor_params
+
+        kwargs = {
+            "text": [prompt],
+            "images": image_tensor_list,
+            "return_tensors": "pt",
+            "padding": True,
+        }
+
+        if has_videos_arg:
+            kwargs["videos"] = videos
+            if "input_data_format" in processor_params:
+                kwargs["input_data_format"] = "channels_last"
+        elif videos is not None and len(videos) > 0:
+            if args.enable_video_to_image_fallback:
+                # Fallback: flatten video frames into images if the processor doesn't support 'videos'
+                # videos is List[List[Frame]] where Frame is (H, W, C)
+                flattened_frames = []
+                for video in videos:
+                    flattened_frames.extend(video)
+
+                # Append the frames to the images list passed to the processor
+                # images is currently a list of tensors (or None)
+                if kwargs["images"] is None:
+                    kwargs["images"] = []
+
+                # Ensure frames are in the same format as images (tensors)
+                # Frames in videos are already uint8 tensors
+                kwargs["images"].extend(flattened_frames)
+            else:
+                logger.warning(f"Model {self.model.name_or_path} does not support video inputs and video-to-image fallback is disabled. Use --enable-video-to-image-fallback to enable it.")
+
         try:
-            batch_feature: BatchFeature = self.processor(
-                text=[prompt],
-                images=image_tensor_list,
-                videos=None if videos is not None and len(videos) == 0 or (hasattr(videos, "shape") and videos.shape[0]) == 0 else videos,
-                return_tensors="pt",
-                padding=True,
-                input_data_format="channels_last"  # Ensure this is set for Qwen
-            )
+            batch_feature: BatchFeature = self.processor(**kwargs)
         except TypeError as exc_info:
             logger.warning(f"Exception while trying to run processor. Your transformers package is version {transformers.__version__} and may need to be updated")
             raise exc_info
diff --git a/comfy/node_requests_caching.py b/comfy/node_requests_caching.py
index 5aa5385e8..930a8c89e 100644
--- a/comfy/node_requests_caching.py
+++ b/comfy/node_requests_caching.py
@@ -4,6 +4,7 @@ import pathlib
 import requests_cache
 
 from contextlib import contextmanager
+from .cli_args import args
 
 @contextmanager
 def use_requests_caching(
@@ -35,5 +36,9 @@ def use_requests_caching(
     kwargs.setdefault('use_cache_dir', not path_provided)
     kwargs.setdefault('cache_control', cache_control)
 
+    if args.disable_requests_caching:
+        yield
+        return
+
     with requests_cache.enabled(cache_name, **kwargs):
         yield
diff --git a/tests/inference/test_mixed_media_generic.py b/tests/inference/test_mixed_media_generic.py
new file mode 100644
index 000000000..7d0647ccd
--- /dev/null
+++ b/tests/inference/test_mixed_media_generic.py
@@ -0,0 +1,47 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestMixedMediaGeneric:
+    @pytest.mark.asyncio
+    async def test_mixed_media_generic(self):
+        graph = GraphBuilder()
+
+        # Load BLIP (small, standard model, image-only processor)
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Salesforce/blip-image-captioning-base")
+
+        # Load video (Goat)
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        # Use frame cap to keep it light
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Load image (Worm)
+        image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/Earthworm.jpg/330px-Earthworm.jpg"
+        load_image = graph.node("LoadImageFromURL", value=image_url)
+
+        # Tokenize with both video and image
+        # BLIP's processor only accepts "images" (a list of tensors) and has no native video support,
+        # so the video-to-image fallback should convert the video frames into additional images.
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="a photography of", videos=load_video.out(0), images=load_image.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=100, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="mixed_media_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        from comfy.cli_args import default_configuration
+        config = default_configuration()
+        config.enable_video_to_image_fallback = True
+
+        async with Comfy(configuration=config) as client:
+            outputs = await client.queue_prompt(prompt)
+
+        assert len(outputs) > 0
diff --git a/tests/inference/test_qwen3vl_mixed_media.py b/tests/inference/test_qwen3vl_mixed_media.py
new file mode 100644
index 000000000..61fb8d17b
--- /dev/null
+++ b/tests/inference/test_qwen3vl_mixed_media.py
@@ -0,0 +1,45 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestQwen3VLMixedMedia:
+    @pytest.mark.asyncio
+    async def test_qwen3vl_mixed_media(self):
+        graph = GraphBuilder()
+
+        # Load Qwen3-VL-2B-Instruct
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Qwen/Qwen3-VL-2B-Instruct", trust_remote_code=True)
+
+        # Load video (Goat)
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        # Use frame cap to keep it light
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Load image (Worm)
+        image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/Earthworm.jpg/330px-Earthworm.jpg"
+        load_image = graph.node("LoadImageFromURL", value=image_url)
+
+        # Tokenize with both video and image
+        # Qwen3-VL is expected to support the 'videos' input natively, like Qwen2-VL
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="Describe what you see in the video and the image.", videos=load_video.out(0), images=load_image.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=100, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="qwen3vl_mixed_media_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        from comfy.cli_args_types import Configuration
+        config = Configuration()
+        config.disable_requests_caching = True
+        async with Comfy(configuration=config) as client:
+            outputs = await client.queue_prompt(prompt)
+
+        assert len(outputs) > 0
diff --git a/tests/inference/test_qwenvl_mixed_media.py b/tests/inference/test_qwenvl_mixed_media.py
new file mode 100644
index 000000000..009244587
--- /dev/null
+++ b/tests/inference/test_qwenvl_mixed_media.py
@@ -0,0 +1,41 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestQwenVLMixedMedia:
+    @pytest.mark.asyncio
+    async def test_qwenvl_mixed_media(self):
+        graph = GraphBuilder()
+
+        # Load Qwen2-VL-2B-Instruct
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Qwen/Qwen2-VL-2B-Instruct")
+
+        # Load video (Goat)
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        # Use frame cap to keep it light
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Load image (Worm)
+        image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/Earthworm.jpg/330px-Earthworm.jpg"
+        load_image = graph.node("LoadImageFromURL", value=image_url)
+
+        # Tokenize with both video and image
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="Describe what you see in the video and the image.", videos=load_video.out(0), images=load_image.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=100, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="qwenvl_mixed_media_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        async with Comfy() as client:
+            outputs = await client.queue_prompt(prompt)
+
+        assert len(outputs) > 0
diff --git a/tests/inference/test_qwenvl_video.py b/tests/inference/test_qwenvl_video.py
new file mode 100644
index 000000000..dc52c5c99
--- /dev/null
+++ b/tests/inference/test_qwenvl_video.py
@@ -0,0 +1,39 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestQwenVLVideo:
+    @pytest.mark.asyncio
+    async def test_qwenvl_video_loading(self):
+        graph = GraphBuilder()
+
+        # Load a small QwenVL model
+        # Qwen/Qwen2-VL-2B-Instruct is a good candidate for a "small" QwenVL model
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Qwen/Qwen2-VL-2B-Instruct")
+
+        # Load video from URL with frame cap to avoid OOM
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Tokenize with video
+        # OneShotInstructTokenize has an optional 'videos' input
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="Describe this video.", videos=load_video.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=50, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="qwenvl_video_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        async with Comfy() as client:
+            outputs = await client.queue_prompt(prompt)
+
+        # If the model cannot handle video input this fails earlier; otherwise check that output was produced
+        assert len(outputs) > 0