Mirror of https://github.com/comfyanonymous/ComfyUI.git (synced 2026-01-10 14:20:49 +08:00)

Commit 8e282aea6d: improve images and videos support
Parent: 4349fac71a
@@ -196,6 +196,7 @@ def _create_parser() -> EnhancedConfigArgParser:
     parser.add_argument("--otel-exporter-otlp-endpoint", type=str, default=None, env_var="OTEL_EXPORTER_OTLP_ENDPOINT", help="A base endpoint URL for any signal type, with an optionally-specified port number. Helpful for when you're sending more than one signal to the same endpoint and want one environment variable to control the endpoint.")
     parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")
     parser.add_argument("--force-hf-local-dir-mode", action="store_true", help="Download repos from huggingface.co to the models/huggingface directory with the \"local_dir\" argument instead of models/huggingface_cache with the \"cache_dir\" argument, recreating the traditional file structure.")
+    parser.add_argument("--enable-video-to-image-fallback", action="store_true", help="Enable fallback to convert video frames to images for models that do not natively support video inputs.")

     parser.add_argument(
         "--front-end-version",
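Side note (not part of the diff): both new options are store_true flags, so they default to False and only flip to True when passed explicitly. A minimal sketch with plain argparse, which EnhancedConfigArgParser appears to extend, reusing the flag name from the diff:

import argparse

parser = argparse.ArgumentParser()
# Same flag name as in the diff; plain argparse stands in for EnhancedConfigArgParser here
parser.add_argument("--enable-video-to-image-fallback", action="store_true",
                    help="Enable fallback to convert video frames to images.")

print(parser.parse_args([]).enable_video_to_image_fallback)  # False (default)
print(parser.parse_args(["--enable-video-to-image-fallback"]).enable_video_to_image_fallback)  # True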
@@ -298,6 +299,7 @@ def _create_parser() -> EnhancedConfigArgParser:
     except Exception as exc:
         logger.error("Failed to load custom config plugin", exc_info=exc)

+    parser.add_argument("--disable-requests-caching", action="store_true", help="Disable requests caching (useful for testing)")
     return parser


@@ -250,6 +250,7 @@ class Configuration(dict):
         self.external_address: Optional[str] = None
         self.disable_known_models: bool = False
         self.max_queue_size: int = 65536
+        self.disable_requests_caching: bool = False
         self.force_channels_last: bool = False
         self.force_hf_local_dir_mode = False
         self.preview_size: int = 512
@@ -290,6 +291,7 @@ class Configuration(dict):
         self.default_device: Optional[int] = None
         self.block_runtime_package_installation = None
         self.enable_eval: Optional[bool] = False
+        self.enable_video_to_image_fallback: bool = False

         for key, value in kwargs.items():
             self[key] = value
@@ -30,6 +30,7 @@ from ..model_downloader import get_or_download_huggingface_repo
 from ..model_management import unet_offload_device, get_torch_device, unet_dtype, load_models_gpu
 from ..model_management_types import ModelManageableStub
 from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block
+from ..cli_args import args

 logger = logging.getLogger(__name__)

@@ -519,6 +520,20 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
         except Exception as exc:
             logger.debug("Could not apply chat template", exc_info=exc)

+        if isinstance(prompt, list):
+            # Fallback: extract text from messages if chat template application failed or wasn't available
+            extracted_text = []
+            for message in prompt:
+                if isinstance(message, dict) and "content" in message:
+                    content = message["content"]
+                    if isinstance(content, str):
+                        extracted_text.append(content)
+                    elif isinstance(content, list):
+                        for item in content:
+                            if isinstance(item, dict) and item.get("type") == "text":
+                                extracted_text.append(item.get("text", ""))
+            prompt = "\n".join(extracted_text)
+
         if self.processor is None and isinstance(prompt, str):
             batch_encoding = tokenizer(prompt, return_tensors="pt").to(device=self.load_device)
             return {**batch_encoding}
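For orientation (not part of the diff): the new fallback walks a chat-style message list and keeps only the text parts. A minimal sketch of the same logic on a hypothetical prompt, assuming the list-of-messages shape used by transformers chat templates (dicts with "role" and "content", where "content" is a string or a list of typed parts):

# Hypothetical chat-style prompt; the role/content structure is an assumption for illustration
prompt = [
    {"role": "user", "content": [
        {"type": "text", "text": "Describe what you see."},
        {"type": "video"},
    ]},
    {"role": "assistant", "content": "Sure."},
]

extracted_text = []
for message in prompt:
    if isinstance(message, dict) and "content" in message:
        content = message["content"]
        if isinstance(content, str):
            extracted_text.append(content)
        elif isinstance(content, list):
            for item in content:
                if isinstance(item, dict) and item.get("type") == "text":
                    extracted_text.append(item.get("text", ""))

print("\n".join(extracted_text))  # "Describe what you see." then "Sure." on the next line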
@@ -527,15 +542,58 @@ class TransformersManagedModel(ModelManageableStub, LanguageModel):
         self.processor.to(device=self.load_device)
         # convert tuple to list from images.unbind() for paligemma workaround
         image_tensor_list = list(images.unbind()) if images is not None and len(images) > 0 else None
+
+        # Convert videos to list of list of frames (uint8)
+        if videos is not None and len(videos) > 0:
+            new_videos = []
+            for v in videos:
+                # Convert to uint8 0-255 if float
+                if v.dtype == torch.float32 or v.dtype == torch.float16 or v.dtype == torch.bfloat16:
+                    v = (v * 255).to(torch.uint8)
+                # Convert (T, H, W, C) tensor to list of (H, W, C) tensors
+                if v.ndim == 4:
+                    new_videos.append(list(v))
+                else:
+                    new_videos.append([v])  # Fallback if not 4D
+            videos = new_videos
+
+        # Check if processor accepts 'videos' argument
+        import inspect
+        processor_params = inspect.signature(self.processor).parameters
+        has_videos_arg = "videos" in processor_params
+
+        kwargs = {
+            "text": [prompt],
+            "images": image_tensor_list,
+            "return_tensors": "pt",
+            "padding": True,
+        }
+
+        if has_videos_arg:
+            kwargs["videos"] = videos
+            if "input_data_format" in processor_params:
+                kwargs["input_data_format"] = "channels_last"
+        elif videos is not None and len(videos) > 0:
+            if args.enable_video_to_image_fallback:
+                # Fallback: flatten video frames into images if processor doesn't support 'videos'
+                # videos is List[List[Frame]] where Frame is (H, W, C)
+                flattened_frames = []
+                for video in videos:
+                    flattened_frames.extend(video)
+
+                # Convert list of frames to list of tensors if needed, or just append to images list
+                # images is currently a list of tensors
+                if kwargs["images"] is None:
+                    kwargs["images"] = []
+
+                # Ensure frames are in the same format as images (tensors)
+                # Frames in videos are already tensors (uint8)
+                kwargs["images"].extend(flattened_frames)
+            else:
+                logger.warning(f"Model {self.model.name_or_path} does not support video inputs and video-to-image fallback is disabled. Use --enable-video-to-image-fallback to enable it.")
+
         try:
-            batch_feature: BatchFeature = self.processor(
-                text=[prompt],
-                images=image_tensor_list,
-                videos=None if videos is not None and len(videos) == 0 or (hasattr(videos, "shape") and videos.shape[0]) == 0 else videos,
-                return_tensors="pt",
-                padding=True,
-                input_data_format="channels_last"  # Ensure this is set for Qwen
-            )
+            batch_feature: BatchFeature = self.processor(**kwargs)
         except TypeError as exc_info:
             logger.warning(f"Exception while trying to run processor. Your transformers package is version {transformers.__version__} and may need to be updated")
             raise exc_info
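As an aside (not part of the diff): the branch above hinges on two things, inspecting the processor's call signature for a 'videos' parameter and normalizing float video tensors into lists of uint8 frames. A minimal standalone sketch of both ideas, with a hypothetical DummyProcessor standing in for a real transformers processor:

import inspect

import torch


class DummyProcessor:
    # Hypothetical stand-in; real transformers processors are callables with similar keyword parameters
    def __call__(self, text=None, images=None, videos=None, return_tensors="pt", padding=True):
        return {"n_images": 0 if images is None else len(images),
                "n_videos": 0 if videos is None else len(videos)}


processor = DummyProcessor()
processor_params = inspect.signature(processor).parameters  # resolves to the __call__ signature
has_videos_arg = "videos" in processor_params                # True for this dummy

# A float (T, H, W, C) video tensor becomes a list of uint8 (H, W, C) frames, as in the diff
video = torch.rand(4, 8, 8, 3)
frames = list((video * 255).to(torch.uint8))

print(has_videos_arg, len(frames), frames[0].shape)  # True 4 torch.Size([8, 8, 3])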
@@ -4,6 +4,7 @@ import pathlib
 import requests_cache
 from contextlib import contextmanager

+from .cli_args import args

 @contextmanager
 def use_requests_caching(
@@ -35,5 +36,9 @@ def use_requests_caching(
     kwargs.setdefault('use_cache_dir', not path_provided)
     kwargs.setdefault('cache_control', cache_control)

+    if args.disable_requests_caching:
+        yield
+        return
+
     with requests_cache.enabled(cache_name, **kwargs):
         yield
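For reference (not part of the diff): yielding early keeps the context manager usable whether or not caching is active, so call sites never need to branch on the flag themselves. A minimal sketch of the same shape, with names chosen purely for illustration:

from contextlib import contextmanager


@contextmanager
def maybe_cached(disabled: bool):
    # Hypothetical illustration of the early-yield pattern used in use_requests_caching
    if disabled:
        yield  # no cache set up; the with-block still runs
        return
    # ... set up the cache here before yielding ...
    yield


with maybe_cached(disabled=True):
    pass  # requests issued in this block would run uncached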
tests/inference/test_mixed_media_generic.py (new file, 47 lines)
@@ -0,0 +1,47 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestMixedMediaGeneric:
+    @pytest.mark.asyncio
+    async def test_mixed_media_generic(self):
+        graph = GraphBuilder()
+
+        # Load BLIP (small, standard model, image-only processor)
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Salesforce/blip-image-captioning-base")
+
+        # Load video (Goat)
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        # Use frame cap to keep it light
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Load image (Worm)
+        image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/Earthworm.jpg/330px-Earthworm.jpg"
+        load_image = graph.node("LoadImageFromURL", value=image_url)
+
+        # Tokenize with both video and image
+        # BLIP expects "images" (list of tensors) if we use the processor correctly.
+        # The fallback logic should convert video frames to images.
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="a photography of", videos=load_video.out(0), images=load_image.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=100, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="mixed_media_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        from comfy.cli_args import default_configuration
+        config = default_configuration()
+        config.enable_video_to_image_fallback = True
+
+        async with Comfy(configuration=config) as client:
+            outputs = await client.queue_prompt(prompt)
+
+        assert len(outputs) > 0
tests/inference/test_qwen3vl_mixed_media.py (new file, 45 lines)
@@ -0,0 +1,45 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestQwen3VLMixedMedia:
+    @pytest.mark.asyncio
+    async def test_qwen3vl_mixed_media(self):
+        graph = GraphBuilder()
+
+        # Load Qwen3-VL-2B-Instruct
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Qwen/Qwen3-VL-2B-Instruct", trust_remote_code=True)
+
+        # Load video (Goat)
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        # Use frame cap to keep it light
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Load image (Worm)
+        image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/Earthworm.jpg/330px-Earthworm.jpg"
+        load_image = graph.node("LoadImageFromURL", value=image_url)
+
+        # Tokenize with both video and image
+        # Qwen3-VL likely supports 'videos' input natively like Qwen2-VL
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="Describe what you see in the video and the image.", videos=load_video.out(0), images=load_image.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=100, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="qwen3vl_mixed_media_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        from comfy.cli_args_types import Configuration
+        config = Configuration()
+        config.disable_requests_caching = True
+        async with Comfy(configuration=config) as client:
+            outputs = await client.queue_prompt(prompt)
+
+        assert len(outputs) > 0
tests/inference/test_qwenvl_mixed_media.py (new file, 41 lines)
@@ -0,0 +1,41 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestQwenVLMixedMedia:
+    @pytest.mark.asyncio
+    async def test_qwenvl_mixed_media(self):
+        graph = GraphBuilder()
+
+        # Load Qwen2-VL-2B-Instruct
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Qwen/Qwen2-VL-2B-Instruct")
+
+        # Load video (Goat)
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        # Use frame cap to keep it light
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Load image (Worm)
+        image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/Earthworm.jpg/330px-Earthworm.jpg"
+        load_image = graph.node("LoadImageFromURL", value=image_url)
+
+        # Tokenize with both video and image
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="Describe what you see in the video and the image.", videos=load_video.out(0), images=load_image.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=100, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="qwenvl_mixed_media_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        async with Comfy() as client:
+            outputs = await client.queue_prompt(prompt)
+
+        assert len(outputs) > 0
tests/inference/test_qwenvl_video.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+import pytest
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.client.embedded_comfy_client import Comfy
+from comfy.api.components.schema.prompt import Prompt
+
+class TestQwenVLVideo:
+    @pytest.mark.asyncio
+    async def test_qwenvl_video_loading(self):
+        graph = GraphBuilder()
+
+        # Load a small QwenVL model
+        # Qwen/Qwen2-VL-2B-Instruct is a good candidate for a "small" QwenVL model
+        model_loader = graph.node("TransformersLoader1", ckpt_name="Qwen/Qwen2-VL-2B-Instruct")
+
+        # Load video from URL with frame cap to avoid OOM
+        video_url = "https://upload.wikimedia.org/wikipedia/commons/f/f7/2024-04-05_Luisenpark_MA_Ziegen_2.webm"
+        load_video = graph.node("LoadVideoFromURL", value=video_url, frame_load_cap=16, select_every_nth=10)
+
+        # Tokenize with video
+        # OneShotInstructTokenize has optional 'videos' input
+        tokenizer = graph.node("OneShotInstructTokenize", model=model_loader.out(0), prompt="Describe this video.", videos=load_video.out(0), chat_template="default")
+
+        # Generate
+        generation = graph.node("TransformersGenerate", model=model_loader.out(0), tokens=tokenizer.out(0), max_new_tokens=50, seed=42)
+
+        # OmitThink
+        omit_think = graph.node("OmitThink", value=generation.out(0))
+
+        # Save output
+        graph.node("SaveString", value=omit_think.out(0), filename_prefix="qwenvl_video_test")
+
+        workflow = graph.finalize()
+        prompt = Prompt.validate(workflow)
+
+        async with Comfy() as client:
+            outputs = await client.queue_prompt(prompt)
+
+        # If an unsupported model fails earlier, this is never reached; otherwise verify output was produced
+        assert len(outputs) > 0