Restore ViT spatial order after windowed attention.

This commit is contained in:
Talmaj Marinc 2026-03-13 20:37:03 +01:00 committed by Talmaj Marinc
parent b202f842af
commit e00ae62907

View File

@ -425,4 +425,7 @@ class Qwen2VLVisionTransformer(nn.Module):
hidden_states = block(hidden_states, position_embeddings, cu_seqlens_now, optimized_attention=optimized_attention)
hidden_states = self.merger(hidden_states)
# Undo the window_index reordering so the output rows return to their original spatial order. This is present in the HF implementation and is potentially important for spatially precise edits.
reverse_indices = torch.argsort(window_index)
hidden_states = hidden_states[reverse_indices, :]
return hidden_states