diff --git a/comfy/audio_encoders/whisper.py b/comfy/audio_encoders/whisper.py
index 93d3782f1..7e1be5f82 100755
--- a/comfy/audio_encoders/whisper.py
+++ b/comfy/audio_encoders/whisper.py
@@ -3,8 +3,9 @@ import torch.nn as nn
 import torch.nn.functional as F
 import torchaudio
 from typing import Optional
-from comfy.ldm.modules.attention import optimized_attention_masked
-import comfy.ops
+from ..ldm.modules.attention import optimized_attention_masked
+from .. import ops
+
 
 class WhisperFeatureExtractor(nn.Module):
     def __init__(self, n_mels=128, device=None):
@@ -66,11 +67,11 @@ class MultiHeadAttention(nn.Module):
         self.out_proj = operations.Linear(d_model, d_model, dtype=dtype, device=device)
 
     def forward(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
+            self,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         batch_size, seq_len, _ = query.shape
 
@@ -96,9 +97,9 @@ class EncoderLayer(nn.Module):
         self.final_layer_norm = operations.LayerNorm(d_model, dtype=dtype, device=device)
 
     def forward(
-        self,
-        x: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None
+            self,
+            x: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
         residual = x
         x = self.self_attn_layer_norm(x)
@@ -117,15 +118,15 @@ class EncoderLayer(nn.Module):
 
 class AudioEncoder(nn.Module):
     def __init__(
-        self,
-        n_mels: int = 128,
-        n_ctx: int = 1500,
-        n_state: int = 1280,
-        n_head: int = 20,
-        n_layer: int = 32,
-        dtype=None,
-        device=None,
-        operations=None
+            self,
+            n_mels: int = 128,
+            n_ctx: int = 1500,
+            n_state: int = 1280,
+            n_head: int = 20,
+            n_layer: int = 32,
+            dtype=None,
+            device=None,
+            operations=None
     ):
         super().__init__()
 
@@ -147,7 +148,7 @@ class AudioEncoder(nn.Module):
 
         x = x.transpose(1, 2)
 
-        x = x + comfy.ops.cast_to_input(self.embed_positions.weight[:, :x.shape[1]], x)
+        x = x + ops.cast_to_input(self.embed_positions.weight[:, :x.shape[1]], x)
 
         all_x = ()
         for layer in self.layers:
@@ -161,15 +162,15 @@ class AudioEncoder(nn.Module):
 
 class WhisperLargeV3(nn.Module):
     def __init__(
-        self,
-        n_mels: int = 128,
-        n_audio_ctx: int = 1500,
-        n_audio_state: int = 1280,
-        n_audio_head: int = 20,
-        n_audio_layer: int = 32,
-        dtype=None,
-        device=None,
-        operations=None
+            self,
+            n_mels: int = 128,
+            n_audio_ctx: int = 1500,
+            n_audio_state: int = 1280,
+            n_audio_head: int = 20,
+            n_audio_layer: int = 32,
+            dtype=None,
+            device=None,
+            operations=None
     ):
         super().__init__()
 
diff --git a/comfy/ldm/chroma_radiance/model.py b/comfy/ldm/chroma_radiance/model.py
index 85f9320f6..d7c6ed4b7 100644
--- a/comfy/ldm/chroma_radiance/model.py
+++ b/comfy/ldm/chroma_radiance/model.py
@@ -1,7 +1,7 @@
 # Credits:
 # Original Flux code can be found on: https://github.com/black-forest-labs/flux
 # Chroma Radiance adaption referenced from https://github.com/lodestone-rock/flow
-
+import dataclasses
 from dataclasses import dataclass
 from typing import Optional
 
@@ -66,6 +66,8 @@ class ChromaRadiance(Chroma):
         self.hidden_dim = params.hidden_dim
         self.n_layers = params.n_layers
         self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
+        # replaces the operation
+        self.img_in = self._img_in
         self.img_in_patch = operations.Conv2d(
             params.in_channels,
             params.hidden_size,
@@ -164,7 +166,7 @@ class ChromaRadiance(Chroma):
             # Impossible to get here as we raise an error on unexpected types on initialization.
             raise NotImplementedError
 
-    def img_in(self, img: Tensor) -> Tensor:
+    def _img_in(self, img: Tensor) -> Tensor:
         img = self.img_in_patch(img)  # -> [B, Hidden, H/P, W/P]
         # flatten into a sequence for the transformer.
         return img.flatten(2).transpose(1, 2)  # -> [B, NumPatches, Hidden]
@@ -263,7 +265,7 @@ class ChromaRadiance(Chroma):
         params = self.params
         if not overrides:
             return params
-        params_dict = {k: getattr(params, k) for k in params.__dataclass_fields__}
+        params_dict = dataclasses.asdict(params)
         nullable_keys = frozenset(("nerf_embedder_dtype",))
         bad_keys = tuple(k for k in overrides if k not in params_dict)
         if bad_keys:
diff --git a/comfy/ldm/hunyuan_video/vae.py b/comfy/ldm/hunyuan_video/vae.py
index 40c12b183..c1e7a5470 100644
--- a/comfy/ldm/hunyuan_video/vae.py
+++ b/comfy/ldm/hunyuan_video/vae.py
@@ -1,8 +1,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
-from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock
-import comfy.ops
-ops = comfy.ops.disable_weight_init
+from ..modules.diffusionmodules.model import ResnetBlock, AttnBlock
+from ...ops import disable_weight_init as ops
 
 
 class PixelShuffle2D(nn.Module):
@@ -52,7 +51,7 @@ class Encoder(nn.Module):
                                                 out_channels=tgt,
                                                 temb_channels=0,
                                                 conv_op=ops.Conv2d)
-                                    for j in range(num_res_blocks)])
+                                                for j in range(num_res_blocks)])
             ch = tgt
             if i < depth:
                 nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and downsample_match_channel else ch
@@ -112,7 +111,7 @@ class Decoder(nn.Module):
                                                 out_channels=tgt,
                                                 temb_channels=0,
                                                 conv_op=ops.Conv2d)
-                                    for j in range(num_res_blocks + 1)])
+                                                for j in range(num_res_blocks + 1)])
             ch = tgt
             if i < depth:
                 nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and upsample_match_channel else ch
diff --git a/comfy/ldm/hunyuan_video/vae_refiner.py b/comfy/ldm/hunyuan_video/vae_refiner.py
index c6f742710..6347dfd77 100644
--- a/comfy/ldm/hunyuan_video/vae_refiner.py
+++ b/comfy/ldm/hunyuan_video/vae_refiner.py
@@ -1,21 +1,22 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d
-import comfy.ops
-import comfy.ldm.models.autoencoder
-ops = comfy.ops.disable_weight_init
+from ..modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d
+from ..models.autoencoder import DiagonalGaussianRegularizer
+from ...ops import disable_weight_init as ops
+
 
 class RMS_norm(nn.Module):
     def __init__(self, dim):
         super().__init__()
         shape = (dim, 1, 1, 1)
-        self.scale = dim**0.5
+        self.scale = dim ** 0.5
         self.gamma = nn.Parameter(torch.empty(shape))
 
     def forward(self, x):
         return F.normalize(x, dim=1) * self.scale * self.gamma
 
+
 class DnSmpl(nn.Module):
     def __init__(self, ic, oc, tds=True):
         super().__init__()
@@ -146,6 +147,7 @@ class UpSmpl(nn.Module):
 
         return h + sc
 
+
 class Encoder(nn.Module):
     def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks,
                  ffactor_spatial, ffactor_temporal, downsample_match_channel=True, **_):
@@ -166,7 +168,7 @@ class Encoder(nn.Module):
                                                 out_channels=tgt,
                                                 temb_channels=0, conv_op=VideoConv3d,
                                                 norm_op=RMS_norm)
-                                    for j in range(num_res_blocks)])
+                                                for j in range(num_res_blocks)])
             ch = tgt
             if i < depth:
                 nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and downsample_match_channel else ch
@@ -182,7 +184,7 @@ class Encoder(nn.Module):
 
         self.norm_out = RMS_norm(ch)
         self.conv_out = VideoConv3d(ch, z_channels << 1, 3, 1, 1)
-        self.regul = comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer()
+        self.regul = DiagonalGaussianRegularizer()
 
     def forward(self, x):
         x = self.conv_in(x)
@@ -209,6 +211,7 @@ class Encoder(nn.Module):
         out = out.permute(0, 2, 1, 3, 4).contiguous()
         return out
 
+
 class Decoder(nn.Module):
     def __init__(self, z_channels, out_channels, block_out_channels, num_res_blocks,
                  ffactor_spatial, ffactor_temporal, upsample_match_channel=True, **_):
@@ -236,7 +239,7 @@ class Decoder(nn.Module):
                                                 out_channels=tgt,
                                                 temb_channels=0, conv_op=VideoConv3d,
                                                 norm_op=RMS_norm)
-                                    for j in range(num_res_blocks + 1)])
+                                                for j in range(num_res_blocks + 1)])
             ch = tgt
             if i < depth:
                 nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and upsample_match_channel else ch
diff --git a/comfy/ldm/wan/model_animate.py b/comfy/ldm/wan/model_animate.py
index 7c87835d4..9ca3a1bf8 100644
--- a/comfy/ldm/wan/model_animate.py
+++ b/comfy/ldm/wan/model_animate.py
@@ -5,8 +5,9 @@ from einops import rearrange
 import torch.nn.functional as F
 import math
 from .model import WanModel, sinusoidal_embedding_1d
-from comfy.ldm.modules.attention import optimized_attention
-import comfy.model_management
+from ..modules.attention import optimized_attention
+from ...model_management import cast_to
+
 
 
 class CausalConv1d(nn.Module):
@@ -46,7 +47,6 @@ class FaceEncoder(nn.Module):
         self.padding_tokens = nn.Parameter(torch.empty(1, 1, 1, hidden_dim, **factory_kwargs))
 
     def forward(self, x):
-
         x = rearrange(x, "b t c -> b c t")
         b, c, t = x.shape
 
@@ -67,7 +67,7 @@ class FaceEncoder(nn.Module):
         x = self.act(x)
         x = self.out_proj(x)
         x = rearrange(x, "(b n) t c -> b t n c", b=b)
-        padding = comfy.model_management.cast_to(self.padding_tokens, dtype=x.dtype, device=x.device).repeat(b, x.shape[1], 1, 1)
+        padding = cast_to(self.padding_tokens, dtype=x.dtype, device=x.device).repeat(b, x.shape[1], 1, 1)
         x = torch.cat([x, padding], dim=-2)
 
         x_local = x.clone()
@@ -94,15 +94,14 @@ def get_norm_layer(norm_layer, operations=None):
 
 
 class FaceAdapter(nn.Module):
     def __init__(
-        self,
-        hidden_dim: int,
-        heads_num: int,
-        qk_norm: bool = True,
-        qk_norm_type: str = "rms",
-        num_adapter_layers: int = 1,
-        dtype=None, device=None, operations=None
+            self,
+            hidden_dim: int,
+            heads_num: int,
+            qk_norm: bool = True,
+            qk_norm_type: str = "rms",
+            num_adapter_layers: int = 1,
+            dtype=None, device=None, operations=None
     ):
-        factory_kwargs = {"dtype": dtype, "device": device}
         super().__init__()
         self.hidden_size = hidden_dim
@@ -122,29 +121,27 @@ class FaceAdapter(nn.Module):
         )
 
     def forward(
-        self,
-        x: torch.Tensor,
-        motion_embed: torch.Tensor,
-        idx: int,
-        freqs_cis_q: Tuple[torch.Tensor, torch.Tensor] = None,
-        freqs_cis_k: Tuple[torch.Tensor, torch.Tensor] = None,
+            self,
+            x: torch.Tensor,
+            motion_embed: torch.Tensor,
+            idx: int,
+            freqs_cis_q: Tuple[torch.Tensor, torch.Tensor] = None,
+            freqs_cis_k: Tuple[torch.Tensor, torch.Tensor] = None,
     ) -> torch.Tensor:
-
         return self.fuser_blocks[idx](x, motion_embed, freqs_cis_q, freqs_cis_k)
-
 
 
 class FaceBlock(nn.Module):
     def __init__(
-        self,
-        hidden_size: int,
-        heads_num: int,
-        qk_norm: bool = True,
-        qk_norm_type: str = "rms",
-        qk_scale: float = None,
-        dtype: Optional[torch.dtype] = None,
-        device: Optional[torch.device] = None,
-        operations=None
+            self,
+            hidden_size: int,
+            heads_num: int,
+            qk_norm: bool = True,
+            qk_norm_type: str = "rms",
+            qk_scale: float = None,
+            dtype: Optional[torch.dtype] = None,
+            device: Optional[torch.device] = None,
+            operations=None
     ):
         factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
@@ -153,7 +150,7 @@ class FaceBlock(nn.Module):
         self.hidden_size = hidden_size
         self.heads_num = heads_num
         head_dim = hidden_size // heads_num
-        self.scale = qk_scale or head_dim**-0.5
+        self.scale = qk_scale or head_dim ** -0.5
 
         self.linear1_kv = operations.Linear(hidden_size, hidden_size * 2, **factory_kwargs)
         self.linear1_q = operations.Linear(hidden_size, hidden_size, **factory_kwargs)
@@ -173,13 +170,12 @@ class FaceBlock(nn.Module):
         self.pre_norm_motion = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
 
     def forward(
-        self,
-        x: torch.Tensor,
-        motion_vec: torch.Tensor,
-        motion_mask: Optional[torch.Tensor] = None,
-        # use_context_parallel=False,
+            self,
+            x: torch.Tensor,
+            motion_vec: torch.Tensor,
+            motion_mask: Optional[torch.Tensor] = None,
+            # use_context_parallel=False,
     ) -> torch.Tensor:
-
         B, T, N, C = motion_vec.shape
         T_comp = T
 
@@ -212,6 +208,7 @@ class FaceBlock(nn.Module):
 
         return output
 
+
 # https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/ops/upfirdn2d/upfirdn2d.py#L162
 def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1):
     _, minor, in_h, in_w = input.shape
@@ -230,9 +227,11 @@ def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1,
     out = out.reshape(-1, minor, in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1)
     return out[:, :, ::down_y, ::down_x]
 
+
 def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)):
     return upfirdn2d_native(input, kernel, up, up, down, down, pad[0], pad[1], pad[0], pad[1])
 
+
 # https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/ops/fused_act/fused_act.py#L81
 class FusedLeakyReLU(torch.nn.Module):
     def __init__(self, channel, negative_slope=0.2, scale=2 ** 0.5, dtype=None, device=None):
@@ -242,11 +241,13 @@ class FusedLeakyReLU(torch.nn.Module):
         self.scale = scale
 
     def forward(self, input):
-        return fused_leaky_relu(input, comfy.model_management.cast_to(self.bias, device=input.device, dtype=input.dtype), self.negative_slope, self.scale)
+        return fused_leaky_relu(input, cast_to(self.bias, device=input.device, dtype=input.dtype), self.negative_slope, self.scale)
+
 
 def fused_leaky_relu(input, bias, negative_slope=0.2, scale=2 ** 0.5):
     return F.leaky_relu(input + bias, negative_slope) * scale
 
+
 class Blur(torch.nn.Module):
     def __init__(self, kernel, pad, dtype=None, device=None):
         super().__init__()
@@ -257,9 +258,10 @@ class Blur(torch.nn.Module):
         self.pad = pad
 
     def forward(self, input):
-        return upfirdn2d(input, comfy.model_management.cast_to(self.kernel, dtype=input.dtype, device=input.device), pad=self.pad)
+        return upfirdn2d(input, cast_to(self.kernel, dtype=input.dtype, device=input.device), pad=self.pad)
 
-#https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L590
+
+# https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L590
 class ScaledLeakyReLU(torch.nn.Module):
     def __init__(self, negative_slope=0.2):
         super().__init__()
@@ -268,6 +270,7 @@ class ScaledLeakyReLU(torch.nn.Module):
     def forward(self, input):
         return F.leaky_relu(input, negative_slope=self.negative_slope)
 
+
 # https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L605
 class EqualConv2d(torch.nn.Module):
     def __init__(self, in_channel, out_channel, kernel_size, stride=1, padding=0, bias=True, dtype=None, device=None, operations=None):
@@ -282,9 +285,10 @@ class EqualConv2d(torch.nn.Module):
         if self.bias is None:
             bias = None
         else:
-            bias = comfy.model_management.cast_to(self.bias, device=input.device, dtype=input.dtype)
+            bias = cast_to(self.bias, device=input.device, dtype=input.dtype)
+
+        return F.conv2d(input, cast_to(self.weight, device=input.device, dtype=input.dtype) * self.scale, bias=bias, stride=self.stride, padding=self.padding)
 
-        return F.conv2d(input, comfy.model_management.cast_to(self.weight, device=input.device, dtype=input.dtype) * self.scale, bias=bias, stride=self.stride, padding=self.padding)
 
 # https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L134
 class EqualLinear(torch.nn.Module):
@@ -300,12 +304,13 @@ class EqualLinear(torch.nn.Module):
         if self.bias is None:
             bias = None
         else:
-            bias = comfy.model_management.cast_to(self.bias, device=input.device, dtype=input.dtype) * self.lr_mul
+            bias = cast_to(self.bias, device=input.device, dtype=input.dtype) * self.lr_mul
 
         if self.activation:
-            out = F.linear(input, comfy.model_management.cast_to(self.weight, device=input.device, dtype=input.dtype) * self.scale)
+            out = F.linear(input, cast_to(self.weight, device=input.device, dtype=input.dtype) * self.scale)
             return fused_leaky_relu(out, bias)
-        return F.linear(input, comfy.model_management.cast_to(self.weight, device=input.device, dtype=input.dtype) * self.scale, bias=bias)
+        return F.linear(input, cast_to(self.weight, device=input.device, dtype=input.dtype) * self.scale, bias=bias)
+
 
 # https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L654
 class ConvLayer(torch.nn.Sequential):
@@ -327,6 +332,7 @@ class ConvLayer(torch.nn.Sequential):
 
         super().__init__(*layers)
 
+
 # https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L704
 class ResBlock(torch.nn.Module):
     def __init__(self, in_channel, out_channel, dtype=None, device=None, operations=None):
@@ -360,6 +366,7 @@ class EncoderApp(torch.nn.Module):
             h = conv(h)
         return h.squeeze(-1).squeeze(-1)
 
+
 class Encoder(torch.nn.Module):
     def __init__(self, dim=512, motion_dim=20, dtype=None, device=None, operations=None):
         super().__init__()
@@ -369,6 +376,7 @@ class Encoder(torch.nn.Module):
     def encode_motion(self, x):
         return self.fc(self.net_app(x))
 
+
 class Direction(torch.nn.Module):
     def __init__(self, motion_dim, dtype=None, device=None, operations=None):
         super().__init__()
@@ -376,17 +384,19 @@ class Direction(torch.nn.Module):
         self.motion_dim = motion_dim
 
     def forward(self, input):
-        stabilized_weight = comfy.model_management.cast_to(self.weight, device=input.device, dtype=input.dtype) + 1e-8 * torch.eye(512, self.motion_dim, device=input.device, dtype=input.dtype)
+        stabilized_weight = cast_to(self.weight, device=input.device, dtype=input.dtype) + 1e-8 * torch.eye(512, self.motion_dim, device=input.device, dtype=input.dtype)
         Q, _ = torch.linalg.qr(stabilized_weight.float())
         if input is None:
             return Q
         return torch.sum(input.unsqueeze(-1) * Q.T.to(input.dtype), dim=1)
 
+
 class Synthesis(torch.nn.Module):
     def __init__(self, motion_dim, dtype=None, device=None, operations=None):
         super().__init__()
         self.direction = Direction(motion_dim, dtype=dtype, device=device, operations=operations)
 
+
 class Generator(torch.nn.Module):
     def __init__(self, style_dim=512, motion_dim=20, dtype=None, device=None, operations=None):
         super().__init__()
@@ -397,6 +407,7 @@ class Generator(torch.nn.Module):
         motion_feat = self.enc.encode_motion(img)
         return self.dec.direction(motion_feat)
 
+
 class AnimateWanModel(WanModel):
     r"""
     Wan diffusion backbone supporting both text-to-video and image-to-video.
@@ -481,16 +492,16 @@ class AnimateWanModel(WanModel):
         return x, motion_vec
 
     def forward_orig(
-        self,
-        x,
-        t,
-        context,
-        clip_fea=None,
-        pose_latents=None,
-        face_pixel_values=None,
-        freqs=None,
-        transformer_options={},
-        **kwargs,
+            self,
+            x,
+            t,
+            context,
+            clip_fea=None,
+            pose_latents=None,
+            face_pixel_values=None,
+            freqs=None,
+            transformer_options={},
+            **kwargs,
     ):
         # embeddings
         x = self.patch_embedding(x.float()).to(x.dtype)
@@ -529,6 +540,7 @@ class AnimateWanModel(WanModel):
                     out = {}
                     out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, transformer_options=args["transformer_options"])
                     return out
+
                 out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})
                 x = out["img"]
             else:
diff --git a/comfy/nodes/base_nodes.py b/comfy/nodes/base_nodes.py
index c8a531fae..8b7b52eca 100644
--- a/comfy/nodes/base_nodes.py
+++ b/comfy/nodes/base_nodes.py
@@ -811,12 +811,12 @@ class VAELoader:
 
     # TODO: scale factor?
     def load_vae(self, vae_name):
+        metadata = {}
         if vae_name == "pixel_space":
-            sd = {}
-            sd["pixel_space_vae"] = torch.tensor(1.0)
+            sd_ = {}
+            sd_["pixel_space_vae"] = torch.tensor(1.0)
         elif vae_name in ["taesd", "taesdxl", "taesd3", "taef1"]:
             sd_ = self.load_taesd(vae_name)
-            metadata = {}
         else:
             vae_path = get_full_path_or_raise("vae", vae_name, KNOWN_VAES)
             sd_, metadata = utils.load_torch_file(vae_path, return_metadata=True)
diff --git a/comfy/text_encoders/hunyuan_image.py b/comfy/text_encoders/hunyuan_image.py
index ee20b0122..c9188a674 100644
--- a/comfy/text_encoders/hunyuan_image.py
+++ b/comfy/text_encoders/hunyuan_image.py
@@ -1,10 +1,11 @@
-from .. import sd1_clip
-from .llama import Qwen25_7BVLI
-from .qwen_image import QwenImageTokenizer, QwenImageTEModel
-from transformers import ByT5Tokenizer
-import os
 import re
 
+from transformers import ByT5Tokenizer
+
+from .llama import Qwen25_7BVLI
+from .qwen_image import QwenImageTokenizer, QwenImageTEModel
+from .t5 import T5
+from .. import sd1_clip
 from ..component_model import files
 
 
@@ -64,7 +65,7 @@ class ByT5SmallModel(sd1_clip.SDClipModel):
             model_options = {}
 
         textmodel_json_config = files.get_path_as_dict(textmodel_json_config, "byt5_config_small_glyph.json", package=__package__)
-        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, model_options=model_options, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=True)
+        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, model_options=model_options, special_tokens={"end": 1, "pad": 0}, model_class=T5, enable_attention_masks=True, zero_out_masked=True)
 
 
 class HunyuanImageTEModel(QwenImageTEModel):