diff --git a/comfy/clip_model.py b/comfy/clip_model.py
index 63d80d17f..b0294ca23 100644
--- a/comfy/clip_model.py
+++ b/comfy/clip_model.py
@@ -15,7 +15,7 @@ class SiglipMultiheadAttentionPoolingHead(torch.nn.Module):
         batch_size = hidden_state.shape[0]
         probe = self.probe.repeat(batch_size, 1, 1)
 
-        hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
+        hidden_state = self.attention(probe, hidden_state, hidden_state)
 
         residual = hidden_state
         hidden_state = self.layernorm(hidden_state)
diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py
index eed49269b..3706f4344 100644
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -1164,12 +1164,14 @@ class MultiheadAttentionComfyv(nn.Module):
             error_msgs,
         )
 
-    def forward(self, src, attn_mask = None, key_padding_mask = None):
+    def forward(self, src, k = None, v = None, attn_mask = None, key_padding_mask = None):
         self._q_proj, self._k_proj, self._v_proj = [t.to(src.device).to(src.dtype) for t in (self._q_proj, self._k_proj, self._v_proj)]
 
         q = self._q_proj(src)
-        k = self._k_proj(src)
-        v = self._v_proj(src)
+        if k is None:
+            k = self._k_proj(src)
+        if v is None:
+            v = self._v_proj(src)
         output = optimized_attention(q, k, v, self.num_heads, mask = attn_mask)
         return self.out_proj(output)
 
diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py
index fd3964b35..86632d82b 100644
--- a/comfy_extras/nodes_video.py
+++ b/comfy_extras/nodes_video.py
@@ -97,7 +97,7 @@ class EncodeVideo(io.ComfyNode):
                 except:
                     out = model.encode(chunk)
             else:
-                out = vae.encode_image(chunk)
+                out = vae.encode_image(chunk, crop=False)
                 out = out["image_embeds"]
             out_cpu = out.cpu()
 
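
For reference, below is a minimal, self-contained sketch of the calling pattern the extended forward enables: the SigLIP pooling head passes its learned probe as the query and the vision tokens as key/value, while callers that pass only src keep the previous self-attention behaviour. The ToyMultiheadAttention class, its dimensions, and the use of torch.nn.functional.scaled_dot_product_attention in place of ComfyUI's optimized_attention are illustrative assumptions, not part of this patch.

# Illustrative sketch only: mirrors the control flow of the updated
# MultiheadAttentionComfyv.forward. The class name, dimensions, and the use
# of scaled_dot_product_attention (instead of ComfyUI's optimized_attention)
# are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyMultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self._q_proj = nn.Linear(embed_dim, embed_dim)
        self._k_proj = nn.Linear(embed_dim, embed_dim)
        self._v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, src, k=None, v=None):
        # Same fallback as the patch: when k/v are not supplied, project src
        # (self-attention); otherwise use the tensors the caller passed in.
        q = self._q_proj(src)
        if k is None:
            k = self._k_proj(src)
        if v is None:
            v = self._v_proj(src)

        def split_heads(t):
            b, n, _ = t.shape
            return t.view(b, n, self.num_heads, self.head_dim).transpose(1, 2)

        out = F.scaled_dot_product_attention(split_heads(q), split_heads(k), split_heads(v))
        out = out.transpose(1, 2).reshape(q.shape[0], q.shape[1], -1)
        return self.out_proj(out)

attn = ToyMultiheadAttention(embed_dim=64, num_heads=4)
probe = torch.randn(2, 1, 64)           # pooling query: one probe token per batch item
hidden_state = torch.randn(2, 16, 64)   # vision tokens used as key/value

pooled = attn(probe, hidden_state, hidden_state)  # cross-attention path, shape (2, 1, 64)
self_out = attn(hidden_state)                     # original self-attention path, shape (2, 16, 64)

Keeping k and v as optional keyword arguments means existing self-attention call sites need no changes, while the pooling head can reuse the same module for cross-attention and drop the [0] tuple indexing that torch.nn.MultiheadAttention required.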