Speed up ernie model by a bit on nvidia and use higher quality rope. (#14192)

This commit is contained in:
comfyanonymous 2026-05-30 17:53:37 -07:00 committed by GitHub
parent ea73d3b2ea
commit 81aa5a38b2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 14 additions and 14 deletions

View File

@ -14,6 +14,7 @@ from torchvision import transforms
import comfy.patcher_extension
from comfy.ldm.modules.attention import optimized_attention
import comfy.ldm.common_dit
import comfy.quant_ops
# ---------------------- Feed Forward Network -----------------------

View File

@ -5,6 +5,7 @@ import torch.nn.functional as F
from comfy.ldm.modules.attention import optimized_attention
import comfy.model_management
import comfy.quant_ops
def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
assert dim % 2 == 0
@ -19,15 +20,6 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
out = torch.stack([torch.cos(out), torch.sin(out)], dim=0)
return out.to(dtype=torch.float32, device=pos.device)
def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
rot_dim = freqs_cis.shape[-1]
x, x_pass = x_in[..., :rot_dim], x_in[..., rot_dim:]
cos_ = freqs_cis[0]
sin_ = freqs_cis[1]
x1, x2 = x.chunk(2, dim=-1)
x_rotated = torch.cat((-x2, x1), dim=-1)
return torch.cat((x * cos_ + x_rotated * sin_, x_pass), dim=-1)
class ErnieImageEmbedND3(nn.Module):
def __init__(self, dim: int, theta: int, axes_dim: tuple):
super().__init__()
@ -37,8 +29,16 @@ class ErnieImageEmbedND3(nn.Module):
def forward(self, ids: torch.Tensor) -> torch.Tensor:
emb = torch.cat([rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(3)], dim=-1)
emb = emb.unsqueeze(3) # [2, B, S, 1, head_dim//2]
return torch.stack([emb, emb], dim=-1).reshape(*emb.shape[:-1], -1) # [B, S, 1, head_dim]
cos_ = emb[0]
sin_ = emb[1]
N = cos_.shape[-1]
half = N // 2
cos_top = cos_[..., :half].repeat_interleave(2, dim=-1)
sin_top = sin_[..., :half].repeat_interleave(2, dim=-1)
cos_bot = cos_[..., half:].repeat_interleave(2, dim=-1)
sin_bot = sin_[..., half:].repeat_interleave(2, dim=-1)
rot = torch.stack([cos_top, -sin_top, sin_bot, cos_bot], dim=-1)
return rot.reshape(*rot.shape[:-1], 2, 2).unsqueeze(2)
class ErnieImagePatchEmbedDynamic(nn.Module):
def __init__(self, in_channels: int, embed_dim: int, patch_size: int, operations, device=None, dtype=None):
@ -115,8 +115,7 @@ class ErnieImageAttention(nn.Module):
key = self.norm_k(key)
if image_rotary_emb is not None:
query = apply_rotary_emb(query, image_rotary_emb)
key = apply_rotary_emb(key, image_rotary_emb)
query, key = comfy.quant_ops.ck.apply_rope_split_half(query, key, image_rotary_emb)
q_flat = query.reshape(B, S, -1)
k_flat = key.reshape(B, S, -1)
@ -274,7 +273,7 @@ class ErnieImageModel(nn.Module):
image_ids = image_ids.view(1, N_img, 3).expand(B, -1, -1)
rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1)).to(x.dtype)
rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1))
del image_ids, text_ids
sample = self.time_proj(timesteps).to(dtype)