diff --git a/comfy/ldm/audio/__init__.py b/comfy/ldm/audio/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/comfy/ldm/audio/autoencoder.py b/comfy/ldm/audio/autoencoder.py
index 8123e66a5..e758b3ade 100644
--- a/comfy/ldm/audio/autoencoder.py
+++ b/comfy/ldm/audio/autoencoder.py
@@ -4,8 +4,8 @@ import torch
 from torch import nn
 from typing import Literal, Dict, Any
 import math
-import comfy.ops
-ops = comfy.ops.disable_weight_init
+from ... import ops
+ops = ops.disable_weight_init
 
 def vae_sample(mean, scale):
     stdev = nn.functional.softplus(scale) + 1e-4
diff --git a/comfy/ldm/audio/dit.py b/comfy/ldm/audio/dit.py
index 1c1112c5e..cf8bb89b8 100644
--- a/comfy/ldm/audio/dit.py
+++ b/comfy/ldm/audio/dit.py
@@ -1,6 +1,7 @@
 # code adapted from: https://github.com/Stability-AI/stable-audio-tools
+from einops.layers.torch import Rearrange
 
-from comfy.ldm.modules.attention import optimized_attention
+from ..modules.attention import optimized_attention
 import typing as tp
 
 import torch
@@ -153,6 +154,8 @@ class RotaryEmbedding(nn.Module):
         return self.forward(t)
 
     def forward(self, t):
+        # seq_len is not passed into forward(), so give the name a defined default
+        seq_len = 0
         # device = self.inv_freq.device
         device = t.device
         dtype = t.dtype
@@ -343,7 +346,7 @@ class Attention(nn.Module):
 
         # determine masking
         masks = []
-        final_attn_mask = None # The mask that will be applied to the attention matrix, taking all masks into account
+        # the combined attention mask (final_attn_mask) is no longer assembled here
 
         if input_mask is not None:
             input_mask = rearrange(input_mask, 'b j -> b 1 1 j')
@@ -351,9 +354,6 @@
 
         # Other masks will be added here later
 
-        if len(masks) > 0:
-            final_attn_mask = ~or_reduce(masks)
-
         n, device = q.shape[-2], q.device
 
         causal = self.causal if causal is None else causal
diff --git a/comfy/ldm/audio/embedders.py b/comfy/ldm/audio/embedders.py
index 82a3210c6..b38689b30 100644
--- a/comfy/ldm/audio/embedders.py
+++ b/comfy/ldm/audio/embedders.py
@@ -6,7 +6,7 @@ from torch import Tensor, einsum
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, TypeVar, Union
 from einops import rearrange
 import math
-import comfy.ops
+from ... import ops
 
 class LearnedPositionalEmbedding(nn.Module):
     """Used for continuous time"""
@@ -27,7 +27,7 @@ class LearnedPositionalEmbedding(nn.Module):
 def TimePositionalEmbedding(dim: int, out_features: int) -> nn.Module:
     return nn.Sequential(
         LearnedPositionalEmbedding(dim),
-        comfy.ops.manual_cast.Linear(in_features=dim + 1, out_features=out_features),
+        ops.manual_cast.Linear(in_features=dim + 1, out_features=out_features),
     )