From f3d0b070f2f9926eefe81bc5623c3ce20510d8fc Mon Sep 17 00:00:00 2001
From: kijai <40791699+kijai@users.noreply.github.com>
Date: Tue, 2 Jun 2026 10:35:58 +0300
Subject: [PATCH] Cleanup

---
 comfy/ldm/wan/model.py        | 10 +++-------
 comfy/model_base.py           |  2 +-
 comfy_extras/nodes_bernini.py | 15 ++++++++++-----
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py
index 394b71d08..8e2116a6c 100644
--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@@ -570,7 +570,7 @@ class WanModel(torch.nn.Module):
                 full_ref = self.ref_conv(full_ref).flatten(2).transpose(1, 2)
                 x = torch.concat((full_ref, x), dim=1)
 
-        # In-context reference streams (Bernini)
+        # In-context reference (Bernini)
         context_latents = kwargs.get("context_latents", None)
         main_len = x.shape[1]
         if context_latents is not None:
@@ -650,10 +650,7 @@ class WanModel(torch.nn.Module):
 
         freqs = self.rope_embedder(img_ids).movedim(1, 2)
 
-        # In-context reference conditioning (e.g. Bernini): a non-zero source_id
-        # composes an extra rotation (over the full head_dim) into the spatial
-        # rope so streams sharing the same spatial coords stay distinct. source_id
-        # 0 is identity, so this is a no-op for all normal Wan usage.
+        # In-context reference: a non-zero source_id composes an extra rotation into the spatial rope
         if source_id:
             d = self.dim // self.num_heads
             pos = torch.tensor([[float(source_id)]], device=freqs.device, dtype=torch.float32)
@@ -683,8 +680,7 @@ class WanModel(torch.nn.Module):
 
         freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options)
 
-        # In-context reference streams: one rope block per stream, each with its
-        # own source_id (1, 2, ...) so they stay distinct from the target (id 0).
+        # In-context reference: one rope block per stream, each with its own source_id (1, 2, ...) to distinguish from the target (id 0).
         context_latents = kwargs.get("context_latents", None)
         if context_latents is not None:
             context_latents = [comfy.ldm.common_dit.pad_to_patch_size(lat, self.patch_size) for lat in context_latents]
diff --git a/comfy/model_base.py b/comfy/model_base.py
index f5224a840..3742062ce 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1525,7 +1525,7 @@ class WAN21(BaseModel):
         return out
 
     def resize_cond_for_context_window(self, cond_key, cond_value, window, x_in, device, retain_index_list=[]):
-        # In-context streams slicing (Bernini)
+        # In-context cond slicing (Bernini)
         if cond_key == "context_latents" and isinstance(getattr(cond_value, "cond", None), list):
             dim = window.dim
             out = []
diff --git a/comfy_extras/nodes_bernini.py b/comfy_extras/nodes_bernini.py
index c29af857e..4de3460bb 100644
--- a/comfy_extras/nodes_bernini.py
+++ b/comfy_extras/nodes_bernini.py
@@ -53,10 +53,15 @@ class BerniniConditioning(io.ComfyNode):
                 io.Int.Input("height", default=480, min=16, max=8192, step=16),
                 io.Int.Input("length", default=81, min=1, max=8192, step=4),
                 io.Int.Input("batch_size", default=1, min=1, max=4096),
-                io.Image.Input("source_video", optional=True, tooltip="Source video to edit/restyle (task v2v or rv2v). Resized to width/height and trimmed to length. Acts as the edit base / canvas."),
-                io.Image.Input("reference_video", optional=True, tooltip="Moving content to composite into the source video (video insertion / ads2v), e.g. a clip to play on a screen. Kept at native aspect (long edge capped at ref_max_size), trimmed to length."),
-                io.Image.Input("reference_images", optional=True, tooltip="Reference image(s) injected as in-context tokens (task r2v or rv2v). Each is kept at its native aspect ratio, long edge capped at ref_max_size."),
-                io.Int.Input("ref_max_size", default=848, min=16, max=8192, step=16, optional=True),
+                io.Image.Input("source_video", optional=True, tooltip=(
+                    "Source video to edit/restyle (task v2v or rv2v). Resized to width/height and trimmed to length. Acts as the edit base / canvas.")),
+                io.Image.Input("reference_video", optional=True, tooltip=(
+                    "Moving content to composite into the source video (video insertion / ads2v), "
+                    "e.g. a clip to play on a screen. Kept at native aspect (long edge capped at ref_max_size), trimmed to length.")),
+                io.Image.Input("reference_images", optional=True, tooltip=(
+                    "Reference image(s) injected as in-context tokens (task r2v or rv2v). Each is kept at its native aspect ratio, long edge capped at ref_max_size.")),
+                io.Int.Input("ref_max_size", default=848, min=16, max=8192, step=16, optional=True, tooltip=(
+                    "Max size for the long edge of reference_video and reference_images. Resized with preserved aspect ratio, snapped to 16px, and no upscaling.")),
             ],
             outputs=[
                 io.Conditioning.Output(display_name="positive"),
@@ -72,7 +77,7 @@ class BerniniConditioning(io.ComfyNode):
                              device=comfy.model_management.intermediate_device())
 
         # Ordered list of condition streams -> source_id by list order:
-        #   source_video (1), reference_video (2), reference_images (3, 4, ...).
+        # source_video (1), reference_video (2), reference_images (3, 4, ...).
         context = []
         if source_video is not None:
             vid = comfy.utils.common_upscale(source_video[:length, :, :, :3].movedim(-1, 1), width, height, "area", "center").movedim(1, -1)