From e00ae62907ecaacd0294dc3c5595af88388d239f Mon Sep 17 00:00:00 2001
From: Talmaj Marinc <talmaj@comfy.org>
Date: Fri, 13 Mar 2026 20:37:03 +0100
Subject: [PATCH] Restore ViT spatial order after windowed attention.

---
 comfy/text_encoders/qwen_vl.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/comfy/text_encoders/qwen_vl.py b/comfy/text_encoders/qwen_vl.py
index 3b18ce730..98c350a12 100644
--- a/comfy/text_encoders/qwen_vl.py
+++ b/comfy/text_encoders/qwen_vl.py
@@ -425,4 +425,7 @@ class Qwen2VLVisionTransformer(nn.Module):
             hidden_states = block(hidden_states, position_embeddings, cu_seqlens_now, optimized_attention=optimized_attention)
 
         hidden_states = self.merger(hidden_states)
+        # Potentially important for spatially precise edits. This is present in the HF implementation.
+        reverse_indices = torch.argsort(window_index)
+        hidden_states = hidden_states[reverse_indices, :]
         return hidden_states