From c951e8f51336928601c6cd2488ef0f8110471873 Mon Sep 17 00:00:00 2001
From: Yousef Rafat <81116377+yousef-rafat@users.noreply.github.com>
Date: Sat, 27 Sep 2025 14:11:37 +0300
Subject: [PATCH] Build 3-row position ids for image embeds in Qwen25_7BVLI

---
 comfy/text_encoders/llama.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py
index eeb66ca76..3615cfe97 100644
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@@ -1,9 +1,11 @@
 import torch
 import torch.nn as nn
+from dataclasses import dataclass
 from typing import Optional, Any
 import math
 
 from comfy.ldm.modules.attention import optimized_attention_for_device
+import comfy.model_management
 import comfy.ldm.common_dit
 
 import comfy.model_management
@@ -401,12 +403,18 @@ class Qwen25_7BVLI(BaseLlama, torch.nn.Module):
         for e in embeds_info:
             if e.get("type") == "image":
                 grid = e.get("extra", None)
+                position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device)
                 start = e.get("index")
+                position_ids[:, :start] = torch.arange(0, start, device=embeds.device)
                 end = e.get("size") + start
                 len_max = int(grid.max()) // 2
                 start_next = len_max + start
+                position_ids[:, end:] = torch.arange(start_next, start_next + (embeds.shape[1] - end), device=embeds.device)
+                position_ids[0, start:end] = start
                 max_d = int(grid[0][1]) // 2
+                position_ids[1, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start]
                 max_d = int(grid[0][2]) // 2
+                position_ids[2, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start]
 
         if grid is None:
             position_ids = None