diff --git a/comfy/language/language_types.py b/comfy/language/language_types.py
index 448a67d1c..7a9fe376e 100644
--- a/comfy/language/language_types.py
+++ b/comfy/language/language_types.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Union, Callable, List, Optional, Protocol, runtime_checkable
+from typing import Union, Callable, List, Optional, Protocol, runtime_checkable, Literal
 
 import numpy as np
 import torch
@@ -63,6 +63,27 @@ LLaVAProcessor = Callable[
 ]
 
 
+class LanguageMessage(TypedDict):
+    role: Literal["system", "user", "assistant"]
+    content: str | MessageContent
+
+
+class MessageContentImage(TypedDict):
+    url: NotRequired[str]
+
+
+class MessageContent(TypedDict):
+    type: Literal["text", "image", "video", "image_url"]
+    text: NotRequired[str]
+    image: NotRequired[str]
+    image_url: NotRequired[MessageContentImage]
+    min_pixels: NotRequired[int]
+    max_pixels: NotRequired[int]
+
+
+LanguagePrompt = list[LanguageMessage]
+
+
 @runtime_checkable
 class LanguageModel(Protocol):
     @staticmethod
@@ -78,7 +99,7 @@ class LanguageModel(Protocol):
                  **kwargs) -> str:
         ...
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         ...
 
     @property
diff --git a/comfy/language/transformers_model_management.py b/comfy/language/transformers_model_management.py
index 9ee4a53c5..f4db3800d 100644
--- a/comfy/language/transformers_model_management.py
+++ b/comfy/language/transformers_model_management.py
@@ -7,7 +7,7 @@ import operator
 import pathlib
 import warnings
 from functools import reduce
-from typing import Optional, Any, Callable, List
+from typing import Optional, Any, Callable
 
 import torch
 from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, AutoProcessor, AutoTokenizer, \
@@ -18,12 +18,13 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING
 
 from .chat_templates import KNOWN_CHAT_TEMPLATES
 from .language_types import ProcessorResult, TOKENS_TYPE, GENERATION_KWARGS_TYPE, TransformerStreamedProgress, \
-    LLaVAProcessor, LanguageModel
+    LLaVAProcessor, LanguageModel, LanguagePrompt
 from .. import model_management
+from ..component_model.tensor_types import RGBImageBatch
 from ..model_downloader import get_or_download_huggingface_repo
 from ..model_management import unet_offload_device, get_torch_device, unet_dtype, load_models_gpu
 from ..model_management_types import ModelManageable
-from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block
+from ..utils import comfy_tqdm, ProgressBar, comfy_progress, seed_for_block, tensor2pil
 
 
 class TransformersManagedModel(ModelManageable, LanguageModel):
@@ -54,8 +55,9 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if subfolder is not None and subfolder != "":
             hub_kwargs["subfolder"] = subfolder
         repo_id = ckpt_name
-        ckpt_name = get_or_download_huggingface_repo(ckpt_name)
         with comfy_tqdm():
+            ckpt_name = get_or_download_huggingface_repo(ckpt_name)
+
             from_pretrained_kwargs = {
                 "pretrained_model_name_or_path": ckpt_name,
                 "trust_remote_code": True,
@@ -323,7 +325,7 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if processor is not None and hasattr(processor, "image_processor") and hasattr(processor.image_processor, "do_rescale"):
             processor.image_processor.do_rescale = False
 
-    def tokenize(self, prompt: str, images: List[torch.Tensor] | torch.Tensor, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         tokenizer = self.tokenizer
         assert tokenizer is not None
         assert hasattr(tokenizer, "decode")
@@ -335,32 +337,57 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         if len(candidate_chat_templates) > 0:
             filename, chat_template = candidate_chat_templates[0]
             logging.debug(f"Selected chat template filename={filename} for {self.model.name_or_path}")
+        if isinstance(images, list):
+            images = torch.stack(images, dim=0)
+        if images is not None:
+            # record (width, height) for each image in the batch
+            image_sizes = [(image.shape[-2], image.shape[-3]) for image in images]
+        else:
+            image_sizes = []
+            images = []
+
         try:
             if hasattr(tokenizer, "apply_chat_template"):
-                # todo: this should come from node inputs
-                prompt = tokenizer.apply_chat_template([
-                    {"role": "user", "content": prompt},
-                ], chat_template=chat_template, add_generation_prompt=True, tokenize=False)
+                messages: LanguagePrompt
+                if isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict):
+                    messages = prompt
+                elif chat_template is not None and "content[" in chat_template:
+                    messages = [
+                        {"role": "user",
+                         "content": [
+                                        {
+                                            "type": "text",
+                                            "text": prompt
+                                        }
+                                    ] + [
+                                        {"type": "image"} for _ in range(len(images))
+                                    ]
+
+                         }
+                    ]
+                else:
+                    messages = [
+                        {"role": "user", "content": prompt},
+                    ]
+                prompt = tokenizer.apply_chat_template(messages, chat_template=chat_template, add_generation_prompt=True, tokenize=False)
         except Exception as exc:
             logging.debug("Could not apply chat template", exc_info=exc)
 
-        if self.processor is None:
+        if self.processor is None and isinstance(prompt, str):
             batch_encoding = tokenizer(prompt, return_tensors="pt").to(device=self.load_device)
             return {**batch_encoding}
         else:
-            assert images is not None and len(images) > 0, "When using a multi-modal model, pass at least one, possibly empty, image"
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.load_device)
-            assert "<image>" in prompt.lower(), "You must specify a <image> token inside the prompt for it to be substituted correctly by a HuggingFace processor"
-            batch_feature: BatchFeature = self.processor([prompt], images=images.unbind(), padding=True, return_tensors="pt")
+            batch_feature: BatchFeature = self.processor(text=[prompt], images=images.unbind(), return_tensors="pt", padding=True)
             if hasattr(self.processor, "to"):
                 self.processor.to(device=self.offload_device)
             assert "input_ids" in batch_feature
             batch_feature.to(device=self.load_device, dtype=self.model_dtype())
 
             # noinspection PyTypeChecker
             return {
-                "image_sizes": [(images.shape[-1], image.shape[-2]) for image in images],
+                "image_sizes": image_sizes,
                 "images": batch_feature["pixel_values"],
                 "inputs": batch_feature["input_ids"],
                 **batch_feature
diff --git a/comfy/model_downloader.py b/comfy/model_downloader.py
index 5f78da20e..4a6d18dea 100644
--- a/comfy/model_downloader.py
+++ b/comfy/model_downloader.py
@@ -423,6 +423,7 @@ KNOWN_HUGGINGFACE_MODEL_REPOS: Final[Set[str]] = {
     'facebook/nllb-200-distilled-1.3B',
     'THUDM/chatglm3-6b',
     'roborovski/superprompt-v1',
+    'Qwen/Qwen2-VL-7B-Instruct',
 }
 
 KNOWN_UNET_MODELS: Final[KnownDownloadables] = KnownDownloadables([
diff --git a/comfy/nodes/base_nodes.py b/comfy/nodes/base_nodes.py
index 1813dab35..c61258d8a 100644
--- a/comfy/nodes/base_nodes.py
+++ b/comfy/nodes/base_nodes.py
@@ -7,9 +7,9 @@ import math
 import random
 import logging
 
-from PIL import Image, ImageOps, ImageSequence, ImageFile
+from PIL import Image, ImageOps, ImageSequence
 from PIL.PngImagePlugin import PngInfo
-from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub import snapshot_download
 from natsort import natsorted
 import numpy as np
 import safetensors.torch
diff --git a/comfy/utils.py b/comfy/utils.py
index d410c59f5..81827a2ee 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -948,3 +948,11 @@ def seed_for_block(seed):
         np.random.set_state(numpy_rng_state)
         if torch.cuda.is_available():
             torch.cuda.set_rng_state_all(cuda_rng_state)
+
+
+def pil2tensor(image: Image) -> torch.Tensor:
+    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
+
+
+def tensor2pil(t_image: torch.Tensor) -> Image:
+    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
diff --git a/comfy_extras/nodes/nodes_openai.py b/comfy_extras/nodes/nodes_openai.py
index 325808286..a9d79cd73 100644
--- a/comfy_extras/nodes/nodes_openai.py
+++ b/comfy_extras/nodes/nodes_openai.py
@@ -14,7 +14,7 @@ from openai.types.chat import ChatCompletionMessageParam
 from comfy.cli_args import args
 from comfy.component_model.tensor_types import RGBImageBatch
 from comfy.language.language_types import LanguageModel, ProcessorResult, GENERATION_KWARGS_TYPE, TOKENS_TYPE, \
-    TransformerStreamedProgress
+    TransformerStreamedProgress, LanguagePrompt
 from comfy.nodes.package_typing import CustomNode, InputTypes
 from comfy.utils import comfy_progress, ProgressBar, seed_for_block
 
@@ -120,7 +120,7 @@ class OpenAILanguageModelWrapper(LanguageModel):
 
         return full_response
 
-    def tokenize(self, prompt: str, images: RGBImageBatch, chat_template: str | None = None) -> ProcessorResult:
+    def tokenize(self, prompt: str | LanguagePrompt, images: RGBImageBatch | None, chat_template: str | None = None) -> ProcessorResult:
         # OpenAI API doesn't require explicit tokenization, so we'll just return the prompt and images as is
         return {
             "inputs": [prompt],
diff --git a/comfy_extras/nodes/nodes_svg.py b/comfy_extras/nodes/nodes_svg.py
index 78e50e5dc..7099c7555 100644
--- a/comfy_extras/nodes/nodes_svg.py
+++ b/comfy_extras/nodes/nodes_svg.py
@@ -7,6 +7,7 @@ import vtracer
 from PIL import Image
 
 from comfy.nodes.package_typing import CustomNode
+from comfy.utils import tensor2pil
 
 
 def RGB2RGBA(image: Image, mask: Image) -> Image:
@@ -14,14 +15,6 @@ def RGB2RGBA(image: Image, mask: Image) -> Image:
     return Image.merge('RGBA', (R, G, B, mask.convert('L')))
 
 
-def pil2tensor(image: Image) -> torch.Tensor:
-    return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
-
-
-def tensor2pil(t_image: torch.Tensor) -> Image:
-    return Image.fromarray(np.clip(255.0 * t_image.cpu().numpy().squeeze(), 0, 255).astype(np.uint8))
-
-
 class ImageToSVG(CustomNode):
     @classmethod
     def INPUT_TYPES(cls):
diff --git a/tests/inference/workflows/qwen2-vl-0.json b/tests/inference/workflows/qwen2-vl-0.json
new file mode 100644
index 000000000..e3693da0d
--- /dev/null
+++ b/tests/inference/workflows/qwen2-vl-0.json
@@ -0,0 +1,106 @@
+{
+  "4": {
+    "inputs": {
+      "ckpt_name": [
+        "20",
+        0
+      ],
+      "subfolder": ""
+    },
+    "class_type": "TransformersLoader",
+    "_meta": {
+      "title": "TransformersLoader"
+    }
+  },
+  "5": {
+    "inputs": {
+      "value": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/43/Cute_dog.jpg/320px-Cute_dog.jpg",
+      "name": "",
+      "title": "Image to query",
+      "description": "",
+      "__required": true
+    },
+    "class_type": "ImageRequestParameter",
+    "_meta": {
+      "title": "ImageRequestParameter"
+    }
+  },
+  "7": {
+    "inputs": {
+      "prompt": "Describe the contents of this image.",
+      "chat_template": "default",
+      "model": [
+        "4",
+        0
+      ],
+      "images": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "OneShotInstructTokenize",
+    "_meta": {
+      "title": "OneShotInstructTokenize"
+    }
+  },
+  "9": {
+    "inputs": {
+      "max_new_tokens": 512,
+      "repetition_penalty": 0,
+      "seed": 2598326659,
+      "__tokens": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting.<|im_end|>",
+      "model": [
+        "4",
+        0
+      ],
+      "tokens": [
+        "7",
+        0
+      ]
+    },
+    "class_type": "TransformersGenerate",
+    "_meta": {
+      "title": "TransformersGenerate"
+    }
+  },
+  "11": {
+    "inputs": {
+      "value": [
+        "9",
+        0
+      ],
+      "filename_prefix": "ComfyUI",
+      "extension": ".txt",
+      "output": "The image features a small brown and white puppy sitting on the grass. The puppy has floppy ears and is looking directly at the camera. Behind the puppy, there is a patch of purple flowers, adding a touch of color to the scene. The overall atmosphere of the image is cute and charming, with the puppy appearing to be in a peaceful outdoor setting."
+    },
+    "class_type": "SaveString",
+    "_meta": {
+      "title": "SaveString"
+    }
+  },
+  "20": {
+    "inputs": {
+      "value": "Qwen/Qwen2-VL-7B-Instruct",
+      "name": "",
+      "title": "",
+      "description": "",
+      "__required": true
+    },
+    "class_type": "StringEnumRequestParameter",
+    "_meta": {
+      "title": "StringEnumRequestParameter"
+    }
+  },
+  "21": {
+    "inputs": {
+      "images": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "PreviewImage",
+    "_meta": {
+      "title": "Preview Image"
+    }
+  }
+}
\ No newline at end of file