From e00ae62907ecaacd0294dc3c5595af88388d239f Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Fri, 13 Mar 2026 20:37:03 +0100 Subject: [PATCH] Restore ViT spatial order after windowed attention. --- comfy/text_encoders/qwen_vl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy/text_encoders/qwen_vl.py b/comfy/text_encoders/qwen_vl.py index 3b18ce730..98c350a12 100644 --- a/comfy/text_encoders/qwen_vl.py +++ b/comfy/text_encoders/qwen_vl.py @@ -425,4 +425,7 @@ class Qwen2VLVisionTransformer(nn.Module): hidden_states = block(hidden_states, position_embeddings, cu_seqlens_now, optimized_attention=optimized_attention) hidden_states = self.merger(hidden_states) + # Undo the window_index reordering so the merged tokens return to their original spatial order. This matches the HF implementation and is potentially important for spatially precise edits. + reverse_indices = torch.argsort(window_index) + hidden_states = hidden_states[reverse_indices, :] return hidden_states