From f1a5f6f5b36f8cd46cc8d7bd4c379d7d08935ac3 Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Thu, 27 Nov 2025 17:53:37 +0200 Subject: [PATCH] Revert RoPE scaling to simpler one --- comfy/ldm/kandinsky5/model.py | 23 +++++++---------------- comfy/model_base.py | 2 +- comfy_extras/nodes_kandinsky5.py | 6 +++--- 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/comfy/ldm/kandinsky5/model.py b/comfy/ldm/kandinsky5/model.py index 50d3ec5cd..a78b9421e 100644 --- a/comfy/ldm/kandinsky5/model.py +++ b/comfy/ldm/kandinsky5/model.py @@ -307,23 +307,14 @@ class Kandinsky5(nn.Module): h_start += rope_options.get("shift_y", 0.0) w_start += rope_options.get("shift_x", 0.0) else: - if self.model_dim == 4096: # pro video model,this is experimental as the original code only had two fixed scales for 512p and 1024p - spatial_size = h * w - scale_16384 = (1.0, 3.16, 3.16) - scale_9216 = (1.0, 2.0, 2.0) - if spatial_size <= 6144: - rope_scale_factor = scale_9216 - elif spatial_size >= 14080: - rope_scale_factor = scale_16384 - else: - t = (spatial_size - 14080) / (6144 - 14080) - rope_scale_factor = tuple(a + (b - a) * t for a, b in zip(scale_16384, scale_9216)) - else: - rope_scale_factor = self.rope_scale_factor + rope_scale_factor = self.rope_scale_factor + if self.model_dim == 4096: # pro video model uses different rope scaling at higher resolutions + if h * w >= 14080: + rope_scale_factor = (1.0, 3.16, 3.16) - t_len = (t_len - 1.0) // rope_scale_factor[0] + 1.0 - h_len = (h_len - 1.0) // rope_scale_factor[1] + 1.0 - w_len = (w_len - 1.0) // rope_scale_factor[2] + 1.0 + t_len = (t_len - 1.0) / rope_scale_factor[0] + 1.0 + h_len = (h_len - 1.0) / rope_scale_factor[1] + 1.0 + w_len = (w_len - 1.0) / rope_scale_factor[2] + 1.0 img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype) img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, 
dtype=dtype).reshape(-1, 1, 1) diff --git a/comfy/model_base.py b/comfy/model_base.py index 43d7fb281..5ae9ca340 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1690,6 +1690,6 @@ class Kandinsky5_image(Kandinsky5): def concat_cond(self, **kwargs): return None - + def process_latent_out(self, latent): # input is still 5D, return single frame to decode with Flux VAE return self.latent_format.process_out(latent)[:, :, 0] diff --git a/comfy_extras/nodes_kandinsky5.py b/comfy_extras/nodes_kandinsky5.py index cb2d83595..ff46398d5 100644 --- a/comfy_extras/nodes_kandinsky5.py +++ b/comfy_extras/nodes_kandinsky5.py @@ -67,7 +67,7 @@ def adaptive_mean_std_normalization(source, reference): # normalization normalized = (source - source_mean) / (source_std + 1e-8) normalized = normalized * reference_std + reference_mean - + return normalized @@ -97,9 +97,9 @@ class NormalizeVideoLatentFrames(io.ComfyNode): first_frames = samples[:, :, :frames_to_normalize] reference_frames_data = samples[:, :, frames_to_normalize:frames_to_normalize+min(reference_frames, samples.shape[2]-frames_to_normalize)] - + normalized_first_frames = adaptive_mean_std_normalization(first_frames, reference_frames_data) - + samples[:, :, :frames_to_normalize] = normalized_first_frames s["samples"] = samples return io.NodeOutput(s)