diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py
index d3fdfe3eb..5a792e4bd 100644
--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@@ -1252,6 +1252,7 @@ class WanModel_S2V(WanModel):
             num_embeds = x.shape[-3] * 4
             audio_emb_global, audio_emb = self.casual_audio_encoder(audio_embed[:, :, :, :num_embeds])
         else:
+            audio_emb_global = None
             audio_emb = None
 
         # embeddings
@@ -1311,7 +1312,7 @@ class WanModel_S2V(WanModel):
                 x = out["img"]
             else:
                 x = block(x, e=e0, freqs=freqs, context=context)
-            if audio_emb is not None:
+            if audio_emb is not None and audio_emb_global is not None:
                 x = self.audio_injector(x, i, audio_emb, audio_emb_global, seq_len)
         # head
         x = self.head(x, e)