From 8e282aea6d8844d8ae729cab0e7da1112c5d292a Mon Sep 17 00:00:00 2001
From: doctorpangloss <2229300+doctorpangloss@users.noreply.github.com>
Date: Wed, 3 Dec 2025 15:28:56 -0800
Subject: [PATCH] improve image and video support

---
 comfy/cli_args.py                             |  2 +
 comfy/cli_args_types.py                       |  2 +
 .../language/transformers_model_management.py | 74 +++++++++++++++++--
 comfy/node_requests_caching.py                |  5 ++
 tests/inference/test_mixed_media_generic.py   | 47 ++++++++++++
 tests/inference/test_qwen3vl_mixed_media.py   | 45 +++++++++++
 tests/inference/test_qwenvl_mixed_media.py    | 41 ++++++++++
 tests/inference/test_qwenvl_video.py          | 39 ++++++++++
 8 files changed, 247 insertions(+), 8 deletions(-)
 create mode 100644 tests/inference/test_mixed_media_generic.py
 create mode 100644 tests/inference/test_qwen3vl_mixed_media.py
 create mode 100644 tests/inference/test_qwenvl_mixed_media.py
 create mode 100644 tests/inference/test_qwenvl_video.py

diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index 63157dd24..d8857597b 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -196,6 +196,7 @@ def _create_parser() -> EnhancedConfigArgParser:
     parser.add_argument("--otel-exporter-otlp-endpoint", type=str, default=None, env_var="OTEL_EXPORTER_OTLP_ENDPOINT", help="A base endpoint URL for any signal type, with an optionally-specified port number. Helpful for when you're sending more than one signal to the same endpoint and want one environment variable to control the endpoint.")
     parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")
     parser.add_argument("--force-hf-local-dir-mode", action="store_true", help="Download repos from huggingface.co to the models/huggingface directory with the \"local_dir\" argument instead of models/huggingface_cache with the \"cache_dir\" argument, recreating the traditional file structure.")
+    parser.add_argument("--enable-video-to-image-fallback", action="store_true", help="Enable fallback to convert video frames to images for models that do not natively support video inputs.")
 
     parser.add_argument(
         "--front-end-version",
@@ -298,6 +299,7 @@ def _create_parser() -> EnhancedConfigArgParser:
         except Exception as exc:
             logger.error("Failed to load custom config plugin", exc_info=exc)
 
+    parser.add_argument("--disable-requests-caching", action="store_true", help="Disable requests caching (useful for testing)")
     return parser
 
 
diff --git a/comfy/cli_args_types.py b/comfy/cli_args_types.py
index 903f46f1a..ee377936f 100644
--- a/comfy/cli_args_types.py
+++ b/comfy/cli_args_types.py
@@ -250,6 +250,7 @@ class Configuration(dict):
         self.external_address: Optional[str] = None
         self.disable_known_models: bool = False
         self.max_queue_size: int = 65536
+        self.disable_requests_caching: bool = False
         self.force_channels_last: bool = False
         self.force_hf_local_dir_mode = False
         self.preview_size: int = 512
@@ -290,6 +291,7 @@ class Configuration(dict):
         self.default_device: Optional[int] = None
         self.block_runtime_package_installation = None
         self.enable_eval: Optional[bool] = False
+        self.enable_video_to_image_fallback: bool = False
         for key, value in kwargs.items():
             self[key] = value
 
diff --git a/comfy/language/transformers_model_management.py b/comfy/language/transformers_model_management.py
index 83d703d5e..66f29f9bc 100644
--- a/comfy/language/transformers_model_management.py
+++ b/comfy/language/transformers_model_management.py
@@ -30,6 +30,7 @@ from ..model_downloader import get_or_download_huggingface_repo
 from ..model_management import unet_offload_device, get_torch_device, unet_dtype, load_models_gpu
 from ..model_management_types import ModelManageableStub
 from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block
+from ..cli_args import args
 
 logger = logging.getLogger(__name__)
 
@@ -519,6 +520,20 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
             except Exception as exc:
                 logger.debug("Could not apply chat template", exc_info=exc)
 
+        if isinstance(prompt, list):
+            # Fallback: extract text from messages if chat template application failed or wasn't available
+            extracted_text = []
+            for message in prompt:
+                if isinstance(message, dict) and "content" in message:
+                    content = message["content"]
+                    if isinstance(content, str):
+                        extracted_text.append(content)
+                    elif isinstance(content, list):
+                        for item in content:
+                            if isinstance(item, dict) and item.get("type") == "text":
+                                extracted_text.append(item.get("text", ""))
+            prompt = "\n".join(extracted_text)
+
         if self.processor is None and isinstance(prompt, str):
             batch_encoding = tokenizer(prompt, return_tensors="pt").to(device=self.load_device)
             return {**batch_encoding}
@@ -527,15 +542,58 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
         self.processor.to(device=self.load_device)
         # convert tuple to list from images.unbind() for paligemma workaround
         image_tensor_list = list(images.unbind()) if images is not None and len(images) > 0 else None
+
+        # Convert videos to a list of lists of frames (uint8)
+        if videos is not None and len(videos) > 0:
+            new_videos = []
+            for v in videos:
+                # Convert to uint8 0-255 if float
+                if v.dtype == torch.float32 or v.dtype == torch.float16 or v.dtype == torch.bfloat16:
+                    v = (v * 255).to(torch.uint8)
+                # Convert (T, H, W, C) tensor to a list of (H, W, C) tensors
+                if v.ndim == 4:
+                    new_videos.append(list(v))
+                else:
+                    new_videos.append([v])  # Fallback if not 4D
+            videos = new_videos
+
+        # Check if the processor accepts a 'videos' argument
+        import inspect
+        processor_params = inspect.signature(self.processor).parameters
+        has_videos_arg = "videos" in processor_params
+
+        kwargs = {
+            "text": [prompt],
+            "images": image_tensor_list,
+            "return_tensors": "pt",
+            "padding": True,
+        }
+
+        if has_videos_arg:
+            kwargs["videos"] = videos
+            if "input_data_format" in processor_params:
+                kwargs["input_data_format"] = "channels_last"
+        elif videos is not None and len(videos) > 0:
+            if args.enable_video_to_image_fallback:
+                # Fallback: flatten video frames into images if the processor doesn't support 'videos'
+                # videos is List[List[Frame]] where Frame is (H, W, C)
+                flattened_frames = []
+                for video in videos:
+                    flattened_frames.extend(video)
+
+                # Append the frames to the images list passed to the processor
+                # images is currently a list of tensors (or None)
+                if kwargs["images"] is None:
+                    kwargs["images"] = []
+
+                # Ensure frames are in the same format as images (tensors)
+                # Frames in videos are already uint8 tensors
+                kwargs["images"].extend(flattened_frames)
+            else:
+                logger.warning(f"Model {self.model.name_or_path} does not support video inputs and video-to-image fallback is disabled. Use --enable-video-to-image-fallback to enable it.")
+
         try:
-            batch_feature: BatchFeature = self.processor(
-                text=[prompt],
-                images=image_tensor_list,
-                videos=None if videos is not None and len(videos) == 0 or (hasattr(videos, "shape") and videos.shape[0]) == 0 else videos,
-                return_tensors="pt",
-                padding=True,
-                input_data_format="channels_last"  # Ensure this is set for Qwen
-            )
+            batch_feature: BatchFeature = self.processor(**kwargs)
         except TypeError as exc_info:
             logger.warning(f"Exception while trying to run processor. Your transformers package is version {transformers.__version__} and may need to be updated")
             raise exc_info
diff --git a/comfy/node_requests_caching.py b/comfy/node_requests_caching.py
index 5aa5385e8..930a8c89e 100644
--- a/comfy/node_requests_caching.py
+++ b/comfy/node_requests_caching.py
@@ -4,6 +4,7 @@ import pathlib
 import requests_cache
 
 from contextlib import contextmanager
+from .cli_args import args
 
 @contextmanager
 def use_requests_caching(
@@ -35,5 +36,9 @@ def use_requests_caching(
     kwargs.setdefault('use_cache_dir', not path_provided)
     kwargs.setdefault('cache_control', cache_control)
 
+    if args.disable_requests_caching:
+        yield
+        return
+
     with requests_cache.enabled(cache_name, **kwargs):
         yield
diff --git a/tests/inference/test_mixed_media_generic.py b/tests/inference/test_mixed_media_generic.py
new file mode 100644
index 000000000..7d0647ccd
--- /dev/null
+++ b/tests/inference/test_mixed_media_generic.py
@@ -0,0 +1,47 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestMixedMediaGeneric:
+    @pytest.mark.asyncio
+    async def test_mixed_media_generic(self):
+        graph = GraphBuilder()
+
+        # Load BLIP (small, standard model, image-only processor)
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Salesforce/blip-image-captioning-base")
+
+        # Load video (Goat)
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        # Use frame cap to keep it light
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Load image (Worm)
+        image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/Earthworm.jpg/330px-Earthworm.jpg"
+        load_image = graph.node("LoadImageFromURL", value=image_url)
+
+        # Tokenize with both video and image
+        # BLIP's processor only accepts "images" (a list of tensors) and has no native video support,
+        # so the video-to-image fallback should convert the video frames into additional images.
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="a photography of", videos=load_video.out(0), images=load_image.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=100, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="mixed_media_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        from comfy.cli_args import default_configuration
+        config = default_configuration()
+        config.enable_video_to_image_fallback = True
+
+        async with Comfy(configuration=config) as client:
+            outputs = await client.queue_prompt(prompt)
+
+        assert len(outputs) > 0
diff --git a/tests/inference/test_qwen3vl_mixed_media.py b/tests/inference/test_qwen3vl_mixed_media.py
new file mode 100644
index 000000000..61fb8d17b
--- /dev/null
+++ b/tests/inference/test_qwen3vl_mixed_media.py
@@ -0,0 +1,45 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestQwen3VLMixedMedia:
+    @pytest.mark.asyncio
+    async def test_qwen3vl_mixed_media(self):
+        graph = GraphBuilder()
+
+        # Load Qwen3-VL-2B-Instruct
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Qwen/Qwen3-VL-2B-Instruct", trust_remote_code=True)
+
+        # Load video (Goat)
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        # Use frame cap to keep it light
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Load image (Worm)
+        image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/Earthworm.jpg/330px-Earthworm.jpg"
+        load_image = graph.node("LoadImageFromURL", value=image_url)
+
+        # Tokenize with both video and image
+        # Qwen3-VL is expected to support the 'videos' input natively, like Qwen2-VL
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="Describe what you see in the video and the image.", videos=load_video.out(0), images=load_image.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=100, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="qwen3vl_mixed_media_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        from comfy.cli_args_types import Configuration
+        config = Configuration()
+        config.disable_requests_caching = True
+        async with Comfy(configuration=config) as client:
+            outputs = await client.queue_prompt(prompt)
+
+        assert len(outputs) > 0
diff --git a/tests/inference/test_qwenvl_mixed_media.py b/tests/inference/test_qwenvl_mixed_media.py
new file mode 100644
index 000000000..009244587
--- /dev/null
+++ b/tests/inference/test_qwenvl_mixed_media.py
@@ -0,0 +1,41 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestQwenVLMixedMedia:
+    @pytest.mark.asyncio
+    async def test_qwenvl_mixed_media(self):
+        graph = GraphBuilder()
+
+        # Load Qwen2-VL-2B-Instruct
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Qwen/Qwen2-VL-2B-Instruct")
+
+        # Load video (Goat)
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        # Use frame cap to keep it light
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Load image (Worm)
+        image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/Earthworm.jpg/330px-Earthworm.jpg"
+        load_image = graph.node("LoadImageFromURL", value=image_url)
+
+        # Tokenize with both video and image
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="Describe what you see in the video and the image.", videos=load_video.out(0), images=load_image.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=100, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="qwenvl_mixed_media_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        async with Comfy() as client:
+            outputs = await client.queue_prompt(prompt)
+
+        assert len(outputs) > 0
diff --git a/tests/inference/test_qwenvl_video.py b/tests/inference/test_qwenvl_video.py
new file mode 100644
index 000000000..dc52c5c99
--- /dev/null
+++ b/tests/inference/test_qwenvl_video.py
@@ -0,0 +1,39 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestQwenVLVideo:
+    @pytest.mark.asyncio
+    async def test_qwenvl_video_loading(self):
+        graph = GraphBuilder()
+
+        # Load a small QwenVL model
+        # Qwen/Qwen2-VL-2B-Instruct is a good candidate for a "small" QwenVL model
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Qwen/Qwen2-VL-2B-Instruct")
+
+        # Load video from URL with frame cap to avoid OOM
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Tokenize with video
+        # OneShotInstructTokenize has an optional 'videos' input
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="Describe this video.", videos=load_video.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=50, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="qwenvl_video_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        async with Comfy() as client:
+            outputs = await client.queue_prompt(prompt)
+
+        # If the model cannot handle video input this fails earlier; otherwise check that output was produced
+        assert len(outputs) > 0