feat: Add Bernini-R model support (Wan video) (CORE-279) (#14216)

2026-07-21 23:41:28 +08:00 · 2026-06-10 07:47:34 +08:00
parent ad564899d3
commit f8e51b674c
4 changed files with 163 additions and 2 deletions
@@ -8,7 +8,7 @@ from einops import rearrange

 from comfy.ldm.modules.attention import optimized_attention
 from comfy.ldm.flux.layers import EmbedND
-from comfy.ldm.flux.math import apply_rope1
+from comfy.ldm.flux.math import apply_rope1, rope
 import comfy.ldm.common_dit
 import comfy.model_management
 import comfy.patcher_extension
@@ -570,6 +570,14 @@ class WanModel(torch.nn.Module):
                full_ref = self.ref_conv(full_ref).flatten(2).transpose(1, 2)
                x = torch.concat((full_ref, x), dim=1)

+        # In-context reference (Bernini)
+        context_latents = kwargs.get("context_latents", None)
+        main_len = x.shape[1]
+        if context_latents is not None:
+            for lat in context_latents:
+                cl = self.patch_embedding(lat.float().to(x.device)).to(x.dtype).flatten(2).transpose(1, 2)
+                x = torch.cat([x, cl], dim=1)
+
        # context
        context = self.text_embedding(context)

@@ -599,6 +607,9 @@ class WanModel(torch.nn.Module):
        # head
        x = self.head(x, e)

+        if context_latents is not None:
+            x = x[:, :main_len]
+
        if full_ref is not None:
            x = x[:, full_ref.shape[1]:]

@@ -606,7 +617,7 @@ class WanModel(torch.nn.Module):
        x = self.unpatchify(x, grid_sizes)
        return x

-    def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, transformer_options={}):
+    def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, transformer_options={}, source_id=0):
        patch_size = self.patch_size
        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
        h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
@@ -638,6 +649,13 @@ class WanModel(torch.nn.Module):
        img_ids = img_ids.reshape(1, -1, img_ids.shape[-1])

        freqs = self.rope_embedder(img_ids).movedim(1, 2)
+
+        # In-context reference: a non-zero source_id composes an extra rotation into the spatial rope
+        if source_id:
+            d = self.dim // self.num_heads
+            pos = torch.tensor([[float(source_id)]], device=freqs.device, dtype=torch.float32)
+            id_rot = rope(pos, d, self.rope_embedder.theta).reshape(1, 1, 1, d // 2, 2, 2).to(freqs.dtype)
+            freqs = torch.einsum('...ij,...jk->...ik', freqs, id_rot)
        return freqs

    def forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, **kwargs):
@@ -661,6 +679,15 @@ class WanModel(torch.nn.Module):
            t_len += 1

        freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options)
+
+        # In-context reference: one rope block per stream, each with it's own source_id (1, 2, ...) to distinguish from the target (id 0).
+        context_latents = kwargs.get("context_latents", None)
+        if context_latents is not None:
+            context_latents = [comfy.ldm.common_dit.pad_to_patch_size(lat, self.patch_size) for lat in context_latents]
+            for i, lat in enumerate(context_latents):
+                freqs = torch.cat([freqs, self.rope_encode(lat.shape[-3], lat.shape[-2], lat.shape[-1], device=x.device, dtype=x.dtype, transformer_options=transformer_options, source_id=i + 1)], dim=1)
+            kwargs = {**kwargs, "context_latents": context_latents}
+
        return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options, **kwargs)[:, :, :t, :h, :w]

    def unpatchify(self, x, grid_sizes):
@@ -1518,8 +1518,26 @@ class WAN21(BaseModel):
        if reference_latents is not None:
            out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])[:, :, 0])

+        # In-context reference conditioning (Bernini)
+        context_latents = kwargs.get("context_latents", None)
+        if context_latents is not None:
+            out['context_latents'] = comfy.conds.CONDList([self.process_latent_in(l) for l in context_latents])
+
        return out

+    def resize_cond_for_context_window(self, cond_key, cond_value, window, x_in, device, retain_index_list=[]):
+        # In-context cond slicing (Bernini)
+        if cond_key == "context_latents" and isinstance(getattr(cond_value, "cond", None), list):
+            dim = window.dim
+            out = []
+            for lat in cond_value.cond:
+                if lat.ndim > dim and lat.shape[dim] > 1 and lat.shape[dim] == x_in.shape[dim]:
+                    out.append(window.get_tensor(lat, device, dim=dim, retain_index_list=retain_index_list))
+                else:
+                    out.append(lat.to(device))
+            return cond_value._copy_with(out)
+        return super().resize_cond_for_context_window(cond_key, cond_value, window, x_in, device, retain_index_list=retain_index_list)
+

 class WAN21_CausalAR(WAN21):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
@@ -0,0 +1,115 @@
+import torch
+from typing_extensions import override
+
+import comfy.model_management
+import comfy.utils
+import node_helpers
+from comfy_api.latest import ComfyExtension, io
+
+
+def _resize_long_edge(image, max_size, stride=16):
+    """Resize (preserve aspect) so the long edge <= max_size, then snap each side to `stride`"""
+    h, w = image.shape[1], image.shape[2]
+    scale = min(max_size / max(h, w), 1.0)
+    nh = max(stride, round(h * scale / stride) * stride)
+    nw = max(stride, round(w * scale / stride) * stride)
+    return comfy.utils.common_upscale(image[:, :, :, :3].movedim(-1, 1), nw, nh, "area", "disabled").movedim(1, -1)
+
+
+class BerniniConditioning(io.ComfyNode):
+    """Bernini in-context conditioning for a Wan2.2-A14B model.
+
+    Attaches the VAE-encoded source video / reference images to the conditioning
+    source video first, then each reference image
+
+    The task is inferred from which inputs are connected:
+      (nothing)                  -> t2v (text-to-video)
+      source_video               -> v2v (video-to-video)
+      source_video + ref_images  -> rv2v (reference-guided video editing)
+      ref_images only            -> r2v (reference-to-video)
+      source_video + ref_video   -> ads2v (insert image/video into video)
+
+    source_video is the edit base / canvas (resized to width x height).
+    reference_video is moving content to composite in.
+    Streams are ordered source_video, reference_video, then reference_images -> source_id (1, 2, 3, ...).
+    """
+
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="BerniniConditioning",
+            display_name="Bernini Conditioning",
+            category="conditioning/video_models",
+            description="Conditioning node for Bernini in-context video/image conditioning. It can be used for the following tasks: t2v (text-to-video), v2v (video-to-video), rv2v (reference-guided video editing), r2v (reference-to-video), ads2v (insert image/video into video)."
+                         "Reference images injected as in-context tokens (r2v, rv2v) are encoded independently at their own native aspect ratio (long edge capped at ref_max_size)",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=832, min=16, max=8192, step=16),
+                io.Int.Input("height", default=480, min=16, max=8192, step=16),
+                io.Int.Input("length", default=81, min=1, max=8192, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.Image.Input("source_video", optional=True, tooltip=(
+                    "Source video to edit or restyle (v2v, rv2v). Resized to width/height and trimmed to length.")),
+                io.Image.Input("reference_video", optional=True, tooltip=(
+                    "Video to insert into the source video (ads2v).")),
+                io.Autogrow.Input("reference_images", optional=True,
+                    template=io.Autogrow.TemplatePrefix(
+                        input=io.Image.Input("reference_image", tooltip=(
+                            "Reference image injected as an in-context token (r2v, rv2v).")),
+                        prefix="reference_image_", min=0, max=8)),
+                io.Int.Input("ref_max_size", default=848, min=16, max=8192, step=16, optional=True, tooltip=(
+                    "Max size for the long edge of reference_video and reference_images. Resized with preserved aspect ratio and snapped to 16px.")),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, vae, width, height, length, batch_size,
+                source_video=None, reference_video=None, reference_images=None, ref_max_size=848) -> io.NodeOutput:
+        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8],
+                             device=comfy.model_management.intermediate_device())
+
+        # source_video (1), reference_video (2), reference_images (3, 4, ...).
+        context = []
+        if source_video is not None:
+            vid = comfy.utils.common_upscale(source_video[:length, :, :, :3].movedim(-1, 1), width, height, "area", "center").movedim(1, -1)
+            context.append(vae.encode(vid[:, :, :, :3]))
+
+        if reference_video is not None:
+            ref_vid = _resize_long_edge(reference_video[:length], ref_max_size)  # moving content, native aspect
+            context.append(vae.encode(ref_vid[:, :, :, :3]))
+
+        # reference_images is an autogrow dict {reference_image_0: IMAGE, ...}; each slot is a
+        # separate stream at its own native aspect (a multi-image batch in one slot -> one stream per frame).
+        if reference_images:
+            for name in sorted(reference_images):
+                imgs = reference_images[name]
+                if imgs is None:
+                    continue
+                for i in range(imgs.shape[0]):
+                    img = _resize_long_edge(imgs[i:i + 1], ref_max_size)  # native aspect per ref
+                    context.append(vae.encode(img[:, :, :, :3]))
+
+        if context:
+            positive = node_helpers.conditioning_set_values(positive, {"context_latents": context})
+            negative = node_helpers.conditioning_set_values(negative, {"context_latents": context})
+
+        return io.NodeOutput(positive, negative, {"samples": latent})
+
+
+class BerniniExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            BerniniConditioning,
+        ]
+
+
+async def comfy_entrypoint() -> BerniniExtension:
+    return BerniniExtension()
@@ -2404,6 +2404,7 @@ async def init_builtin_extra_nodes():
        "nodes_video.py",
        "nodes_lumina2.py",
        "nodes_wan.py",
+        "nodes_bernini.py",
        "nodes_lotus.py",
        "nodes_hunyuan3d.py",
        "nodes_primitive.py",