Mirror of https://github.com/comfyanonymous/ComfyUI.git (synced 2026-02-10 21:42:37 +08:00)

Commit 39cd0f5243: Merge remote-tracking branch 'upstream/master' into addBatchIndex

README.md (10 lines changed)
@@ -11,7 +11,7 @@ This ui will let you design and execute advanced stable diffusion pipelines usin

## Features

- Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
- Fully supports SD1.x and SD2.x
- Fully supports SD1.x, SD2.x and SDXL
- Asynchronous Queue system
- Many optimizations: Only re-executes the parts of the workflow that changes between executions.
- Command line option: ```--lowvram``` to make it work on GPUs with less than 3GB vram (enabled automatically on GPUs with low vram)

@@ -154,11 +154,13 @@ And then you can use that terminal to run ComfyUI without installing any depende

```python main.py```

### For AMD 6700, 6600 and maybe others
### For AMD cards not officially supported by ROCm

Try running it with this command if you have issues:

```HSA_OVERRIDE_GFX_VERSION=10.3.0 python main.py```
For 6700, 6600 and maybe other RDNA2 or older: ```HSA_OVERRIDE_GFX_VERSION=10.3.0 python main.py```

For AMD 7600 and maybe other RDNA3 cards: ```HSA_OVERRIDE_GFX_VERSION=11.0.0 python main.py```

# Notes

@@ -191,7 +193,7 @@ You can set this command line setting to disable the upcasting to fp32 in some c

Use ```--preview-method auto``` to enable previews.

The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_encoder.pth](https://github.com/madebyollin/taesd/raw/main/taesd_encoder.pth) and [taesd_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesd_decoder.pth) models and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI to enable high-quality previews.
The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesd_decoder.pth) (for SD1.x and SD2.x) and [taesdxl_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesdxl_decoder.pth) (for SDXL) models and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI to enable high-quality previews.

## Support and dev channel
@@ -34,8 +34,10 @@ class ControlNet(nn.Module):
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        num_classes=None,
        use_checkpoint=False,
        use_fp16=False,
        use_bf16=False,
        num_heads=-1,
        num_head_channels=-1,
        num_heads_upsample=-1,
@@ -51,6 +53,8 @@ class ControlNet(nn.Module):
        num_attention_blocks=None,
        disable_middle_self_attn=False,
        use_linear_in_transformer=False,
        adm_in_channels=None,
        transformer_depth_middle=None,
    ):
        super().__init__()
        if use_spatial_transformer:
@@ -75,6 +79,10 @@ class ControlNet(nn.Module):
        self.image_size = image_size
        self.in_channels = in_channels
        self.model_channels = model_channels
        if isinstance(transformer_depth, int):
            transformer_depth = len(channel_mult) * [transformer_depth]
        if transformer_depth_middle is None:
            transformer_depth_middle = transformer_depth[-1]
        if isinstance(num_res_blocks, int):
            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
        else:
@@ -97,8 +105,10 @@ class ControlNet(nn.Module):
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.num_classes = num_classes
        self.use_checkpoint = use_checkpoint
        self.dtype = th.float16 if use_fp16 else th.float32
        self.dtype = th.bfloat16 if use_bf16 else self.dtype
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample
@@ -111,6 +121,24 @@ class ControlNet(nn.Module):
            linear(time_embed_dim, time_embed_dim),
        )

        if self.num_classes is not None:
            if isinstance(self.num_classes, int):
                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
            elif self.num_classes == "continuous":
                print("setting up linear c_adm embedding layer")
                self.label_emb = nn.Linear(1, time_embed_dim)
            elif self.num_classes == "sequential":
                assert adm_in_channels is not None
                self.label_emb = nn.Sequential(
                    nn.Sequential(
                        linear(adm_in_channels, time_embed_dim),
                        nn.SiLU(),
                        linear(time_embed_dim, time_embed_dim),
                    )
                )
            else:
                raise ValueError()

        self.input_blocks = nn.ModuleList(
            [
                TimestepEmbedSequential(
@@ -179,7 +207,7 @@ class ControlNet(nn.Module):
                            num_head_channels=dim_head,
                            use_new_attention_order=use_new_attention_order,
                        ) if not use_spatial_transformer else SpatialTransformer(
                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
                            ch, num_heads, dim_head, depth=transformer_depth[level], context_dim=context_dim,
                            disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
                            use_checkpoint=use_checkpoint
                        )
@@ -238,7 +266,7 @@ class ControlNet(nn.Module):
                num_head_channels=dim_head,
                use_new_attention_order=use_new_attention_order,
            ) if not use_spatial_transformer else SpatialTransformer(  # always uses a self-attn
                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
                ch, num_heads, dim_head, depth=transformer_depth_middle, context_dim=context_dim,
                disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
                use_checkpoint=use_checkpoint
            ),
@@ -257,7 +285,7 @@ class ControlNet(nn.Module):
    def make_zero_conv(self, channels):
        return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0)))

    def forward(self, x, hint, timesteps, context, **kwargs):
    def forward(self, x, hint, timesteps, context, y=None, **kwargs):
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
        emb = self.time_embed(t_emb)

@@ -265,6 +293,14 @@ class ControlNet(nn.Module):

        outs = []

        hs = []
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
        emb = self.time_embed(t_emb)

        if self.num_classes is not None:
            assert y.shape[0] == x.shape[0]
            emb = emb + self.label_emb(y)

        h = x.type(self.dtype)
        for module, zero_conv in zip(self.input_blocks, self.zero_convs):
            if guided_hint is not None:
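The change above threads an optional ADM/class-conditioning vector `y` into the ControlNet forward pass and adds its embedding to the timestep embedding, mirroring what the UNet does for SDXL-style conditioning. Below is a minimal standalone sketch of that pattern only; the layer sizes are illustrative and not taken from ComfyUI.

```python
import torch
import torch.nn as nn

# Minimal sketch: add a "sequential" label embedding (ADM vector) to a timestep
# embedding, mirroring the emb = emb + self.label_emb(y) step added above.
time_embed_dim = 1280      # illustrative size
adm_in_channels = 2816     # illustrative size (SDXL-style ADM vector width)

label_emb = nn.Sequential(
    nn.Sequential(
        nn.Linear(adm_in_channels, time_embed_dim),
        nn.SiLU(),
        nn.Linear(time_embed_dim, time_embed_dim),
    )
)

batch = 2
emb = torch.randn(batch, time_embed_dim)   # stands in for the timestep embedding
y = torch.randn(batch, adm_in_channels)    # stands in for the ADM conditioning vector

assert y.shape[0] == emb.shape[0]
emb = emb + label_emb(y)                   # conditioning is injected additively
print(emb.shape)                           # torch.Size([2, 1280])
```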
comfy/clip_config_bigg.json (new file, 23 lines)

@@ -0,0 +1,23 @@
{
  "architectures": [
    "CLIPTextModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_size": 1280,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 5120,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 77,
  "model_type": "clip_text_model",
  "num_attention_heads": 20,
  "num_hidden_layers": 32,
  "pad_token_id": 1,
  "projection_dim": 512,
  "torch_dtype": "float32",
  "vocab_size": 49408
}
@@ -29,31 +29,32 @@ class ClipVisionModel():
        outputs = self.model(**inputs)
        return outputs

def convert_to_transformers(sd):
def convert_to_transformers(sd, prefix):
    sd_k = sd.keys()
    if "embedder.model.visual.transformer.resblocks.0.attn.in_proj_weight" in sd_k:
    if "{}transformer.resblocks.0.attn.in_proj_weight".format(prefix) in sd_k:
        keys_to_replace = {
            "embedder.model.visual.class_embedding": "vision_model.embeddings.class_embedding",
            "embedder.model.visual.conv1.weight": "vision_model.embeddings.patch_embedding.weight",
            "embedder.model.visual.positional_embedding": "vision_model.embeddings.position_embedding.weight",
            "embedder.model.visual.ln_post.bias": "vision_model.post_layernorm.bias",
            "embedder.model.visual.ln_post.weight": "vision_model.post_layernorm.weight",
            "embedder.model.visual.ln_pre.bias": "vision_model.pre_layrnorm.bias",
            "embedder.model.visual.ln_pre.weight": "vision_model.pre_layrnorm.weight",
            "{}class_embedding".format(prefix): "vision_model.embeddings.class_embedding",
            "{}conv1.weight".format(prefix): "vision_model.embeddings.patch_embedding.weight",
            "{}positional_embedding".format(prefix): "vision_model.embeddings.position_embedding.weight",
            "{}ln_post.bias".format(prefix): "vision_model.post_layernorm.bias",
            "{}ln_post.weight".format(prefix): "vision_model.post_layernorm.weight",
            "{}ln_pre.bias".format(prefix): "vision_model.pre_layrnorm.bias",
            "{}ln_pre.weight".format(prefix): "vision_model.pre_layrnorm.weight",
        }

        for x in keys_to_replace:
            if x in sd_k:
                sd[keys_to_replace[x]] = sd.pop(x)

        if "embedder.model.visual.proj" in sd_k:
            sd['visual_projection.weight'] = sd.pop("embedder.model.visual.proj").transpose(0, 1)
        if "{}proj".format(prefix) in sd_k:
            sd['visual_projection.weight'] = sd.pop("{}proj".format(prefix)).transpose(0, 1)

        sd = transformers_convert(sd, "embedder.model.visual", "vision_model", 32)
        sd = transformers_convert(sd, prefix, "vision_model.", 32)
    return sd

def load_clipvision_from_sd(sd):
    sd = convert_to_transformers(sd)
def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
    if convert_keys:
        sd = convert_to_transformers(sd, prefix)
    if "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
        json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
    else:
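The refactor above replaces the hard-coded "embedder.model.visual." prefix with a caller-supplied `prefix` argument, so the same conversion can run on CLIP-vision weights wherever they happen to sit in a checkpoint. Here is a hedged, standalone sketch of the same prefix-based key renaming idea on a plain dictionary; the keys and helper name are made up for illustration, this is not the real conversion table.

```python
# Sketch: rename state-dict keys by stripping a caller-supplied prefix,
# in the spirit of convert_to_transformers(sd, prefix) above.
def strip_prefix(sd, prefix, replacements):
    for old_suffix, new_key in replacements.items():
        old_key = "{}{}".format(prefix, old_suffix)
        if old_key in sd:
            sd[new_key] = sd.pop(old_key)
    return sd

sd = {"embedder.model.visual.class_embedding": 1, "embedder.model.visual.proj": 2}
sd = strip_prefix(sd, "embedder.model.visual.", {
    "class_embedding": "vision_model.embeddings.class_embedding",
    "proj": "visual_projection.weight",
})
print(sorted(sd))
```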
@@ -77,7 +77,7 @@ class BatchedBrownianTree:
        except TypeError:
            seed = [seed]
            self.batched = False
        self.trees = [torchsde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed]
        self.trees = [torchsde.BrownianTree(t0.cpu(), w0.cpu(), t1.cpu(), entropy=s, **kwargs) for s in seed]

    @staticmethod
    def sort(a, b):
@@ -85,7 +85,7 @@ class BatchedBrownianTree:

    def __call__(self, t0, t1):
        t0, t1, sign = self.sort(t0, t1)
        w = torch.stack([tree(t0, t1) for tree in self.trees]) * (self.sign * sign)
        w = torch.stack([tree(t0.cpu().float(), t1.cpu().float()).to(t0.dtype).to(t0.device) for tree in self.trees]) * (self.sign * sign)
        return w if self.batched else w[0]

@@ -543,7 +543,8 @@ def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None,
def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    """DPM-Solver++ (stochastic)."""
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max) if noise_sampler is None else noise_sampler
    seed = extra_args.get("seed", None)
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed) if noise_sampler is None else noise_sampler
    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])
    sigma_fn = lambda t: t.neg().exp()
@@ -613,8 +614,9 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    if solver_type not in {'heun', 'midpoint'}:
        raise ValueError('solver_type must be \'heun\' or \'midpoint\'')

    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max) if noise_sampler is None else noise_sampler
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed) if noise_sampler is None else noise_sampler
    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])
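The BatchedBrownianTree change builds `torchsde.BrownianTree` from CPU tensors and then moves the sampled increments back to the caller's dtype and device, while the SDE samplers now pull a `seed` out of `extra_args` so the tree is deterministic. A hedged standalone sketch of that construction pattern (requires torchsde; the shapes, times, and seed here are illustrative only):

```python
import torch
import torchsde

# Sketch: build the Brownian tree on CPU, then cast/move its samples back to the
# dtype/device of the tensors the sampler actually uses.
x = torch.zeros(4, 16)                        # stands in for the latent batch
t0 = torch.tensor(0.0, dtype=torch.float32)   # sorted endpoints (t0 < t1)
t1 = torch.tensor(1.0, dtype=torch.float32)

tree = torchsde.BrownianTree(t0.cpu(), torch.zeros_like(x).cpu(), t1.cpu(), entropy=42)
w = tree(t0.cpu().float(), t1.cpu().float()).to(t0.dtype).to(t0.device)
print(w.shape)  # same shape as x
```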
comfy/latent_formats.py (new file, 31 lines)

@@ -0,0 +1,31 @@

class LatentFormat:
    def process_in(self, latent):
        return latent * self.scale_factor

    def process_out(self, latent):
        return latent / self.scale_factor

class SD15(LatentFormat):
    def __init__(self, scale_factor=0.18215):
        self.scale_factor = scale_factor
        self.latent_rgb_factors = [
            #    R       G       B
            [ 0.298,  0.207,  0.208],  # L1
            [ 0.187,  0.286,  0.173],  # L2
            [-0.158,  0.189,  0.264],  # L3
            [-0.184, -0.271, -0.473],  # L4
        ]
        self.taesd_decoder_name = "taesd_decoder.pth"

class SDXL(LatentFormat):
    def __init__(self):
        self.scale_factor = 0.13025
        self.latent_rgb_factors = [ #TODO: these are the factors for SD1.5, need to estimate new ones for SDXL
            #    R       G       B
            [ 0.298,  0.207,  0.208],  # L1
            [ 0.187,  0.286,  0.173],  # L2
            [-0.158,  0.189,  0.264],  # L3
            [-0.184, -0.271, -0.473],  # L4
        ]
        self.taesd_decoder_name = "taesdxl_decoder.pth"
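The new LatentFormat classes centralize the per-model latent scaling that previously lived in the VAE wrapper: latents are multiplied by `scale_factor` on the way into the sampler and divided on the way out, with SD1.x using 0.18215 and SDXL 0.13025. A small standalone usage sketch of the same idea (a simplified re-implementation, not the module above):

```python
import torch

class LatentFormat:
    scale_factor = 1.0
    def process_in(self, latent):
        return latent * self.scale_factor
    def process_out(self, latent):
        return latent / self.scale_factor

class SD15(LatentFormat):
    scale_factor = 0.18215

class SDXL(LatentFormat):
    scale_factor = 0.13025

vae_latent = torch.randn(1, 4, 64, 64)
fmt = SDXL()
model_space = fmt.process_in(vae_latent)   # what the UNet/sampler sees
back = fmt.process_out(model_space)        # what goes back to the VAE decoder
print(torch.allclose(back, vae_latent))    # True
```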
@@ -180,6 +180,12 @@ class DDIMSampler(object):
        )
        return samples, intermediates

    def q_sample(self, x_start, t, noise=None):
        if noise is None:
            noise = torch.randn_like(x_start)
        return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)

    @torch.no_grad()
    def ddim_sampling(self, cond, shape,
                      x_T=None, ddim_use_original_steps=False,
@@ -214,7 +220,7 @@ class DDIMSampler(object):

            if mask is not None:
                assert x0 is not None
                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
                img_orig = self.q_sample(x0, ts)  # TODO: deterministic forward pass?
                img = img_orig * mask + (1. - mask) * img

            if ucg_schedule is not None:
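The DDIM fix gives the sampler its own q_sample so masked (inpaint-style) sampling no longer reaches into self.model. q_sample is the usual forward-diffusion identity x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise. A standalone numeric sketch, assuming a simple linear beta schedule rather than the exact schedule registered by the model:

```python
import torch

# Sketch of q_sample: diffuse a clean latent x0 to timestep t using cumulative alphas.
T = 1000
betas = torch.linspace(0.00085, 0.012, T)          # illustrative schedule
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
sqrt_ac = alphas_cumprod.sqrt()
sqrt_one_minus_ac = (1.0 - alphas_cumprod).sqrt()

def q_sample(x_start, t, noise=None):
    if noise is None:
        noise = torch.randn_like(x_start)
    return sqrt_ac[t] * x_start + sqrt_one_minus_ac[t] * noise

x0 = torch.randn(1, 4, 8, 8)
xt = q_sample(x0, t=500)
print(xt.shape)  # torch.Size([1, 4, 8, 8])
```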
@@ -12,8 +12,6 @@ from .sub_quadratic_attention import efficient_dot_product_attention
from comfy import model_management
import comfy.ops

from . import tomesd

if model_management.xformers_enabled():
    import xformers
    import xformers.ops
@@ -519,23 +517,39 @@ class BasicTransformerBlock(nn.Module):
        self.norm2 = nn.LayerNorm(dim, dtype=dtype)
        self.norm3 = nn.LayerNorm(dim, dtype=dtype)
        self.checkpoint = checkpoint
        self.n_heads = n_heads
        self.d_head = d_head

    def forward(self, x, context=None, transformer_options={}):
        return checkpoint(self._forward, (x, context, transformer_options), self.parameters(), self.checkpoint)

    def _forward(self, x, context=None, transformer_options={}):
        extra_options = {}
        block = None
        block_index = 0
        if "current_index" in transformer_options:
            extra_options["transformer_index"] = transformer_options["current_index"]
        if "block_index" in transformer_options:
            extra_options["block_index"] = transformer_options["block_index"]
            block_index = transformer_options["block_index"]
            extra_options["block_index"] = block_index
        if "original_shape" in transformer_options:
            extra_options["original_shape"] = transformer_options["original_shape"]
        if "block" in transformer_options:
            block = transformer_options["block"]
            extra_options["block"] = block
        if "patches" in transformer_options:
            transformer_patches = transformer_options["patches"]
        else:
            transformer_patches = {}

        extra_options["n_heads"] = self.n_heads
        extra_options["dim_head"] = self.d_head

        if "patches_replace" in transformer_options:
            transformer_patches_replace = transformer_options["patches_replace"]
        else:
            transformer_patches_replace = {}

        n = self.norm1(x)
        if self.disable_self_attn:
            context_attn1 = context
@@ -551,12 +565,32 @@ class BasicTransformerBlock(nn.Module):
            for p in patch:
                n, context_attn1, value_attn1 = p(n, context_attn1, value_attn1, extra_options)

        if "tomesd" in transformer_options:
            m, u = tomesd.get_functions(x, transformer_options["tomesd"]["ratio"], transformer_options["original_shape"])
            n = u(self.attn1(m(n), context=context_attn1, value=value_attn1))
        if block is not None:
            transformer_block = (block[0], block[1], block_index)
        else:
            transformer_block = None
        attn1_replace_patch = transformer_patches_replace.get("attn1", {})
        block_attn1 = transformer_block
        if block_attn1 not in attn1_replace_patch:
            block_attn1 = block

        if block_attn1 in attn1_replace_patch:
            if context_attn1 is None:
                context_attn1 = n
                value_attn1 = n
            n = self.attn1.to_q(n)
            context_attn1 = self.attn1.to_k(context_attn1)
            value_attn1 = self.attn1.to_v(value_attn1)
            n = attn1_replace_patch[block_attn1](n, context_attn1, value_attn1, extra_options)
            n = self.attn1.to_out(n)
        else:
            n = self.attn1(n, context=context_attn1, value=value_attn1)

        if "attn1_output_patch" in transformer_patches:
            patch = transformer_patches["attn1_output_patch"]
            for p in patch:
                n = p(n, extra_options)

        x += n
        if "middle_patch" in transformer_patches:
            patch = transformer_patches["middle_patch"]
@@ -573,7 +607,21 @@ class BasicTransformerBlock(nn.Module):
            for p in patch:
                n, context_attn2, value_attn2 = p(n, context_attn2, value_attn2, extra_options)

        n = self.attn2(n, context=context_attn2, value=value_attn2)
        attn2_replace_patch = transformer_patches_replace.get("attn2", {})
        block_attn2 = transformer_block
        if block_attn2 not in attn2_replace_patch:
            block_attn2 = block

        if block_attn2 in attn2_replace_patch:
            if value_attn2 is None:
                value_attn2 = context_attn2
            n = self.attn2.to_q(n)
            context_attn2 = self.attn2.to_k(context_attn2)
            value_attn2 = self.attn2.to_v(value_attn2)
            n = attn2_replace_patch[block_attn2](n, context_attn2, value_attn2, extra_options)
            n = self.attn2.to_out(n)
        else:
            n = self.attn2(n, context=context_attn2, value=value_attn2)

        if "attn2_output_patch" in transformer_patches:
            patch = transformer_patches["attn2_output_patch"]
@@ -600,7 +648,7 @@ class SpatialTransformer(nn.Module):
                 use_checkpoint=True, dtype=None):
        super().__init__()
        if exists(context_dim) and not isinstance(context_dim, list):
            context_dim = [context_dim]
            context_dim = [context_dim] * depth
        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = Normalize(in_channels, dtype=dtype)
@@ -630,7 +678,7 @@ class SpatialTransformer(nn.Module):
    def forward(self, x, context=None, transformer_options={}):
        # note: if no context is given, cross-attention defaults to self-attention
        if not isinstance(context, list):
            context = [context]
            context = [context] * len(self.transformer_blocks)
        b, c, h, w = x.shape
        x_in = x
        x = self.norm(x)
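With this change a patch registered under transformer_options["patches_replace"]["attn1"] or ["attn2"] receives the already-projected q, k, v plus extra_options (which now carries n_heads, dim_head and the (block, block_index) it is running in), and must return the attention result before to_out. A hedged sketch of what such a callable could look like; this is plain scaled dot-product attention written for illustration, not ComfyUI's optimized attention kernels:

```python
import torch

# Sketch: a replacement attention function with the (q, k, v, extra_options)
# signature used by the attn1/attn2 replace patches above.
def my_attention_patch(q, k, v, extra_options):
    heads = extra_options["n_heads"]
    b, n_q, _ = q.shape

    def split(t):
        # (b, n, heads * d) -> (b, heads, n, d)
        return t.reshape(b, t.shape[1], heads, -1).permute(0, 2, 1, 3)

    q, k, v = split(q), split(k), split(v)
    attn = torch.softmax(q @ k.transpose(-1, -2) / q.shape[-1] ** 0.5, dim=-1)
    out = attn @ v
    return out.permute(0, 2, 1, 3).reshape(b, n_q, heads * out.shape[-1])

q = torch.randn(2, 64, 320)   # queries from the spatial tokens
k = torch.randn(2, 77, 320)   # keys/values from the text context (attn2 case)
v = torch.randn(2, 77, 320)
print(my_attention_patch(q, k, v, {"n_heads": 8}).shape)  # torch.Size([2, 64, 320])
```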
@@ -735,203 +735,3 @@ class Decoder(nn.Module):
        if self.tanh_out:
            h = torch.tanh(h)
        return h


class SimpleDecoder(nn.Module):
    def __init__(self, in_channels, out_channels, *args, **kwargs):
        super().__init__()
        self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1),
                                    ResnetBlock(in_channels=in_channels,
                                                out_channels=2 * in_channels,
                                                temb_channels=0, dropout=0.0),
                                    ResnetBlock(in_channels=2 * in_channels,
                                                out_channels=4 * in_channels,
                                                temb_channels=0, dropout=0.0),
                                    ResnetBlock(in_channels=4 * in_channels,
                                                out_channels=2 * in_channels,
                                                temb_channels=0, dropout=0.0),
                                    nn.Conv2d(2*in_channels, in_channels, 1),
                                    Upsample(in_channels, with_conv=True)])
        # end
        self.norm_out = Normalize(in_channels)
        self.conv_out = torch.nn.Conv2d(in_channels,
                                        out_channels,
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)

    def forward(self, x):
        for i, layer in enumerate(self.model):
            if i in [1,2,3]:
                x = layer(x, None)
            else:
                x = layer(x)

        h = self.norm_out(x)
        h = nonlinearity(h)
        x = self.conv_out(h)
        return x


class UpsampleDecoder(nn.Module):
    def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
                 ch_mult=(2,2), dropout=0.0):
        super().__init__()
        # upsampling
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        block_in = in_channels
        curr_res = resolution // 2 ** (self.num_resolutions - 1)
        self.res_blocks = nn.ModuleList()
        self.upsample_blocks = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            res_block = []
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks + 1):
                res_block.append(ResnetBlock(in_channels=block_in,
                                             out_channels=block_out,
                                             temb_channels=self.temb_ch,
                                             dropout=dropout))
                block_in = block_out
            self.res_blocks.append(nn.ModuleList(res_block))
            if i_level != self.num_resolutions - 1:
                self.upsample_blocks.append(Upsample(block_in, True))
                curr_res = curr_res * 2

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in,
                                        out_channels,
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)

    def forward(self, x):
        # upsampling
        h = x
        for k, i_level in enumerate(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.res_blocks[i_level][i_block](h, None)
            if i_level != self.num_resolutions - 1:
                h = self.upsample_blocks[k](h)
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h


class LatentRescaler(nn.Module):
    def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2):
        super().__init__()
        # residual block, interpolate, residual block
        self.factor = factor
        self.conv_in = nn.Conv2d(in_channels,
                                 mid_channels,
                                 kernel_size=3,
                                 stride=1,
                                 padding=1)
        self.res_block1 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
                                                     out_channels=mid_channels,
                                                     temb_channels=0,
                                                     dropout=0.0) for _ in range(depth)])
        self.attn = AttnBlock(mid_channels)
        self.res_block2 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
                                                     out_channels=mid_channels,
                                                     temb_channels=0,
                                                     dropout=0.0) for _ in range(depth)])

        self.conv_out = nn.Conv2d(mid_channels,
                                  out_channels,
                                  kernel_size=1,
                                  )

    def forward(self, x):
        x = self.conv_in(x)
        for block in self.res_block1:
            x = block(x, None)
        x = torch.nn.functional.interpolate(x, size=(int(round(x.shape[2]*self.factor)), int(round(x.shape[3]*self.factor))))
        x = self.attn(x)
        for block in self.res_block2:
            x = block(x, None)
        x = self.conv_out(x)
        return x


class MergedRescaleEncoder(nn.Module):
    def __init__(self, in_channels, ch, resolution, out_ch, num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True,
                 ch_mult=(1,2,4,8), rescale_factor=1.0, rescale_module_depth=1):
        super().__init__()
        intermediate_chn = ch * ch_mult[-1]
        self.encoder = Encoder(in_channels=in_channels, num_res_blocks=num_res_blocks, ch=ch, ch_mult=ch_mult,
                               z_channels=intermediate_chn, double_z=False, resolution=resolution,
                               attn_resolutions=attn_resolutions, dropout=dropout, resamp_with_conv=resamp_with_conv,
                               out_ch=None)
        self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=intermediate_chn,
                                       mid_channels=intermediate_chn, out_channels=out_ch, depth=rescale_module_depth)

    def forward(self, x):
        x = self.encoder(x)
        x = self.rescaler(x)
        return x


class MergedRescaleDecoder(nn.Module):
    def __init__(self, z_channels, out_ch, resolution, num_res_blocks, attn_resolutions, ch, ch_mult=(1,2,4,8),
                 dropout=0.0, resamp_with_conv=True, rescale_factor=1.0, rescale_module_depth=1):
        super().__init__()
        tmp_chn = z_channels*ch_mult[-1]
        self.decoder = Decoder(out_ch=out_ch, z_channels=tmp_chn, attn_resolutions=attn_resolutions, dropout=dropout,
                               resamp_with_conv=resamp_with_conv, in_channels=None, num_res_blocks=num_res_blocks,
                               ch_mult=ch_mult, resolution=resolution, ch=ch)
        self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=z_channels, mid_channels=tmp_chn,
                                       out_channels=tmp_chn, depth=rescale_module_depth)

    def forward(self, x):
        x = self.rescaler(x)
        x = self.decoder(x)
        return x


class Upsampler(nn.Module):
    def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2):
        super().__init__()
        assert out_size >= in_size
        num_blocks = int(np.log2(out_size//in_size))+1
        factor_up = 1.+ (out_size % in_size)
        print(f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}")
        self.rescaler = LatentRescaler(factor=factor_up, in_channels=in_channels, mid_channels=2*in_channels,
                                       out_channels=in_channels)
        self.decoder = Decoder(out_ch=out_channels, resolution=out_size, z_channels=in_channels, num_res_blocks=2,
                               attn_resolutions=[], in_channels=None, ch=in_channels,
                               ch_mult=[ch_mult for _ in range(num_blocks)])

    def forward(self, x):
        x = self.rescaler(x)
        x = self.decoder(x)
        return x


class Resize(nn.Module):
    def __init__(self, in_channels=None, learned=False, mode="bilinear"):
        super().__init__()
        self.with_conv = learned
        self.mode = mode
        if self.with_conv:
            print(f"Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode")
            raise NotImplementedError()
            assert in_channels is not None
            # no asymmetric padding in torch conv, must do it ourselves
            self.conv = torch.nn.Conv2d(in_channels,
                                        in_channels,
                                        kernel_size=4,
                                        stride=2,
                                        padding=1)

    def forward(self, x, scale_factor=1.0):
        if scale_factor==1.0:
            return x
        else:
            x = torch.nn.functional.interpolate(x, mode=self.mode, align_corners=False, scale_factor=scale_factor)
        return x
@@ -502,6 +502,7 @@ class UNetModel(nn.Module):
        disable_middle_self_attn=False,
        use_linear_in_transformer=False,
        adm_in_channels=None,
        transformer_depth_middle=None,
    ):
        super().__init__()
        if use_spatial_transformer:
@@ -526,6 +527,10 @@ class UNetModel(nn.Module):
        self.in_channels = in_channels
        self.model_channels = model_channels
        self.out_channels = out_channels
        if isinstance(transformer_depth, int):
            transformer_depth = len(channel_mult) * [transformer_depth]
        if transformer_depth_middle is None:
            transformer_depth_middle = transformer_depth[-1]
        if isinstance(num_res_blocks, int):
            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
        else:
@@ -631,7 +636,7 @@ class UNetModel(nn.Module):
                            num_head_channels=dim_head,
                            use_new_attention_order=use_new_attention_order,
                        ) if not use_spatial_transformer else SpatialTransformer(
                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
                            ch, num_heads, dim_head, depth=transformer_depth[level], context_dim=context_dim,
                            disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
                            use_checkpoint=use_checkpoint, dtype=self.dtype
                        )
@@ -690,7 +695,7 @@ class UNetModel(nn.Module):
                num_head_channels=dim_head,
                use_new_attention_order=use_new_attention_order,
            ) if not use_spatial_transformer else SpatialTransformer(  # always uses a self-attn
                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
                ch, num_heads, dim_head, depth=transformer_depth_middle, context_dim=context_dim,
                disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
                use_checkpoint=use_checkpoint, dtype=self.dtype
            ),
@@ -746,7 +751,7 @@ class UNetModel(nn.Module):
                            num_head_channels=dim_head,
                            use_new_attention_order=use_new_attention_order,
                        ) if not use_spatial_transformer else SpatialTransformer(
                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
                            ch, num_heads, dim_head, depth=transformer_depth[level], context_dim=context_dim,
                            disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
                            use_checkpoint=use_checkpoint, dtype=self.dtype
                        )
@@ -825,17 +830,20 @@ class UNetModel(nn.Module):

        h = x.type(self.dtype)
        for id, module in enumerate(self.input_blocks):
            transformer_options["block"] = ("input", id)
            h = forward_timestep_embed(module, h, emb, context, transformer_options)
            if control is not None and 'input' in control and len(control['input']) > 0:
                ctrl = control['input'].pop()
                if ctrl is not None:
                    h += ctrl
            hs.append(h)
        transformer_options["block"] = ("middle", 0)
        h = forward_timestep_embed(self.middle_block, h, emb, context, transformer_options)
        if control is not None and 'middle' in control and len(control['middle']) > 0:
            h += control['middle'].pop()

        for module in self.output_blocks:
        for id, module in enumerate(self.output_blocks):
            transformer_options["block"] = ("output", id)
            hsp = hs.pop()
            if control is not None and 'output' in control and len(control['output']) > 0:
                ctrl = control['output'].pop()
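The UNet forward now tags every block with transformer_options["block"] = ("input"/"middle"/"output", index) and pops ControlNet residuals from per-stage lists in the control dict, adding each one to the hidden states. A small hedged sketch of that pop-and-add pattern with dummy tensors, just to show the bookkeeping:

```python
import torch

# Sketch: consume per-stage ControlNet residuals the way the UNet forward above does.
h = torch.zeros(1, 4, 8, 8)
control = {"input": [torch.ones(1, 4, 8, 8), None],
           "middle": [torch.ones(1, 4, 8, 8) * 2]}

for block_id in range(2):  # pretend we run two input blocks
    if control is not None and 'input' in control and len(control['input']) > 0:
        ctrl = control['input'].pop()
        if ctrl is not None:
            h += ctrl

if control is not None and 'middle' in control and len(control['middle']) > 0:
    h += control['middle'].pop()

print(h.mean().item())  # 3.0: one input residual (the None was skipped) plus the middle one
```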
@@ -2,12 +2,15 @@ import torch
from comfy.ldm.modules.diffusionmodules.openaimodel import UNetModel
from comfy.ldm.modules.encoders.noise_aug_modules import CLIPEmbeddingNoiseAugmentation
from comfy.ldm.modules.diffusionmodules.util import make_beta_schedule
from comfy.ldm.modules.diffusionmodules.openaimodel import Timestep
import numpy as np

class BaseModel(torch.nn.Module):
    def __init__(self, unet_config, v_prediction=False):
    def __init__(self, model_config, v_prediction=False):
        super().__init__()

        unet_config = model_config.unet_config
        self.latent_format = model_config.latent_format
        self.register_schedule(given_betas=None, beta_schedule="linear", timesteps=1000, linear_start=0.00085, linear_end=0.012, cosine_s=8e-3)
        self.diffusion_model = UNetModel(**unet_config)
        self.v_prediction = v_prediction
@@ -15,9 +18,9 @@ class BaseModel(torch.nn.Module):
            self.parameterization = "v"
        else:
            self.parameterization = "eps"
        if "adm_in_channels" in unet_config:
            self.adm_channels = unet_config["adm_in_channels"]
        else:

        self.adm_channels = unet_config.get("adm_in_channels", None)
        if self.adm_channels is None:
            self.adm_channels = 0
        print("v_prediction", v_prediction)
        print("adm", self.adm_channels)
@@ -55,9 +58,35 @@ class BaseModel(torch.nn.Module):
    def is_adm(self):
        return self.adm_channels > 0

    def encode_adm(self, **kwargs):
        return None

    def load_model_weights(self, sd, unet_prefix=""):
        to_load = {}
        keys = list(sd.keys())
        for k in keys:
            if k.startswith(unet_prefix):
                to_load[k[len(unet_prefix):]] = sd.pop(k)

        m, u = self.diffusion_model.load_state_dict(to_load, strict=False)
        if len(m) > 0:
            print("unet missing:", m)

        if len(u) > 0:
            print("unet unexpected:", u)
        del to_load
        return self

    def process_latent_in(self, latent):
        return self.latent_format.process_in(latent)

    def process_latent_out(self, latent):
        return self.latent_format.process_out(latent)


class SD21UNCLIP(BaseModel):
    def __init__(self, unet_config, noise_aug_config, v_prediction=True):
        super().__init__(unet_config, v_prediction)
    def __init__(self, model_config, noise_aug_config, v_prediction=True):
        super().__init__(model_config, v_prediction)
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(**noise_aug_config)

    def encode_adm(self, **kwargs):
@@ -92,6 +121,58 @@ class SD21UNCLIP(BaseModel):
        return adm_out

class SDInpaint(BaseModel):
    def __init__(self, unet_config, v_prediction=False):
        super().__init__(unet_config, v_prediction)
    def __init__(self, model_config, v_prediction=False):
        super().__init__(model_config, v_prediction)
        self.concat_keys = ("mask", "masked_image")

class SDXLRefiner(BaseModel):
    def __init__(self, model_config, v_prediction=False):
        super().__init__(model_config, v_prediction)
        self.embedder = Timestep(256)

    def encode_adm(self, **kwargs):
        clip_pooled = kwargs["pooled_output"]
        width = kwargs.get("width", 768)
        height = kwargs.get("height", 768)
        crop_w = kwargs.get("crop_w", 0)
        crop_h = kwargs.get("crop_h", 0)

        if kwargs.get("prompt_type", "") == "negative":
            aesthetic_score = kwargs.get("aesthetic_score", 2.5)
        else:
            aesthetic_score = kwargs.get("aesthetic_score", 6)

        print(clip_pooled.shape, width, height, crop_w, crop_h, aesthetic_score)
        out = []
        out.append(self.embedder(torch.Tensor([width])))
        out.append(self.embedder(torch.Tensor([height])))
        out.append(self.embedder(torch.Tensor([crop_w])))
        out.append(self.embedder(torch.Tensor([crop_h])))
        out.append(self.embedder(torch.Tensor([aesthetic_score])))
        flat = torch.flatten(torch.cat(out))[None, ]
        return torch.cat((clip_pooled.to(flat.device), flat), dim=1)

class SDXL(BaseModel):
    def __init__(self, model_config, v_prediction=False):
        super().__init__(model_config, v_prediction)
        self.embedder = Timestep(256)

    def encode_adm(self, **kwargs):
        clip_pooled = kwargs["pooled_output"]
        width = kwargs.get("width", 768)
        height = kwargs.get("height", 768)
        crop_w = kwargs.get("crop_w", 0)
        crop_h = kwargs.get("crop_h", 0)
        target_width = kwargs.get("target_width", width)
        target_height = kwargs.get("target_height", height)

        print(clip_pooled.shape, width, height, crop_w, crop_h, target_width, target_height)
        out = []
        out.append(self.embedder(torch.Tensor([width])))
        out.append(self.embedder(torch.Tensor([height])))
        out.append(self.embedder(torch.Tensor([crop_w])))
        out.append(self.embedder(torch.Tensor([crop_h])))
        out.append(self.embedder(torch.Tensor([target_width])))
        out.append(self.embedder(torch.Tensor([target_height])))
        flat = torch.flatten(torch.cat(out))[None, ]
        return torch.cat((clip_pooled.to(flat.device), flat), dim=1)
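SDXL's encode_adm builds its conditioning vector by passing width, height, crop offsets and target size through a 256-dimensional Timestep embedder and concatenating the result with the pooled CLIP output. A hedged standalone sketch of that construction follows; the sinusoidal embedding here is the standard formulation written for illustration, not imported from ComfyUI.

```python
import math
import torch

def timestep_embedding(t, dim=256):
    # standard sinusoidal embedding of a scalar, used here for size/crop conditioning
    half = dim // 2
    freqs = torch.exp(-math.log(10000) * torch.arange(half, dtype=torch.float32) / half)
    args = t.float()[:, None] * freqs[None]
    return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

clip_pooled = torch.randn(1, 1280)           # stands in for the pooled text-encoder output
values = [1024, 1024, 0, 0, 1024, 1024]      # width, height, crop_w, crop_h, target_w, target_h
out = [timestep_embedding(torch.Tensor([v])) for v in values]
flat = torch.flatten(torch.cat(out))[None, ]
adm = torch.cat((clip_pooled, flat), dim=1)
print(adm.shape)  # torch.Size([1, 2816]) = 1280 + 6 * 256
```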
comfy/model_detection.py (new file, 120 lines)

@@ -0,0 +1,120 @@

from . import supported_models

def count_blocks(state_dict_keys, prefix_string):
    count = 0
    while True:
        c = False
        for k in state_dict_keys:
            if k.startswith(prefix_string.format(count)):
                c = True
                break
        if c == False:
            break
        count += 1
    return count

def detect_unet_config(state_dict, key_prefix, use_fp16):
    state_dict_keys = list(state_dict.keys())
    num_res_blocks = 2

    unet_config = {
        "use_checkpoint": False,
        "image_size": 32,
        "out_channels": 4,
        "num_res_blocks": num_res_blocks,
        "use_spatial_transformer": True,
        "legacy": False
    }

    y_input = '{}label_emb.0.0.weight'.format(key_prefix)
    if y_input in state_dict_keys:
        unet_config["num_classes"] = "sequential"
        unet_config["adm_in_channels"] = state_dict[y_input].shape[1]
    else:
        unet_config["adm_in_channels"] = None

    unet_config["use_fp16"] = use_fp16
    model_channels = state_dict['{}input_blocks.0.0.weight'.format(key_prefix)].shape[0]
    in_channels = state_dict['{}input_blocks.0.0.weight'.format(key_prefix)].shape[1]

    num_res_blocks = []
    channel_mult = []
    attention_resolutions = []
    transformer_depth = []
    context_dim = None
    use_linear_in_transformer = False

    current_res = 1
    count = 0

    last_res_blocks = 0
    last_transformer_depth = 0
    last_channel_mult = 0

    while True:
        prefix = '{}input_blocks.{}.'.format(key_prefix, count)
        block_keys = sorted(list(filter(lambda a: a.startswith(prefix), state_dict_keys)))
        if len(block_keys) == 0:
            break

        if "{}0.op.weight".format(prefix) in block_keys: #new layer
            if last_transformer_depth > 0:
                attention_resolutions.append(current_res)
                transformer_depth.append(last_transformer_depth)
            num_res_blocks.append(last_res_blocks)
            channel_mult.append(last_channel_mult)

            current_res *= 2
            last_res_blocks = 0
            last_transformer_depth = 0
            last_channel_mult = 0
        else:
            res_block_prefix = "{}0.in_layers.0.weight".format(prefix)
            if res_block_prefix in block_keys:
                last_res_blocks += 1
                last_channel_mult = state_dict["{}0.out_layers.3.weight".format(prefix)].shape[0] // model_channels

            transformer_prefix = prefix + "1.transformer_blocks."
            transformer_keys = sorted(list(filter(lambda a: a.startswith(transformer_prefix), state_dict_keys)))
            if len(transformer_keys) > 0:
                last_transformer_depth = count_blocks(state_dict_keys, transformer_prefix + '{}')
                if context_dim is None:
                    context_dim = state_dict['{}0.attn2.to_k.weight'.format(transformer_prefix)].shape[1]
                    use_linear_in_transformer = len(state_dict['{}1.proj_in.weight'.format(prefix)].shape) == 2

        count += 1

    if last_transformer_depth > 0:
        attention_resolutions.append(current_res)
        transformer_depth.append(last_transformer_depth)
    num_res_blocks.append(last_res_blocks)
    channel_mult.append(last_channel_mult)
    transformer_depth_middle = count_blocks(state_dict_keys, '{}middle_block.1.transformer_blocks.'.format(key_prefix) + '{}')

    if len(set(num_res_blocks)) == 1:
        num_res_blocks = num_res_blocks[0]

    if len(set(transformer_depth)) == 1:
        transformer_depth = transformer_depth[0]

    unet_config["in_channels"] = in_channels
    unet_config["model_channels"] = model_channels
    unet_config["num_res_blocks"] = num_res_blocks
    unet_config["attention_resolutions"] = attention_resolutions
    unet_config["transformer_depth"] = transformer_depth
    unet_config["channel_mult"] = channel_mult
    unet_config["transformer_depth_middle"] = transformer_depth_middle
    unet_config['use_linear_in_transformer'] = use_linear_in_transformer
    unet_config["context_dim"] = context_dim
    return unet_config


def model_config_from_unet(state_dict, unet_key_prefix, use_fp16):
    unet_config = detect_unet_config(state_dict, unet_key_prefix, use_fp16)
    for model_config in supported_models.models:
        if model_config.matches(unet_config):
            return model_config(unet_config)

    return None
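model_config_from_unet drives the automatic checkpoint detection: count_blocks walks numbered key prefixes until one is missing, and the resulting unet_config is matched against the registered supported_models. Below is a small usage sketch of the counting idea on fake state-dict keys; it is a condensed standalone re-implementation with invented keys, not the module above.

```python
# Sketch: how count_blocks-style detection walks numbered prefixes in a state dict.
def count_blocks(state_dict_keys, prefix_string):
    count = 0
    while True:
        if not any(k.startswith(prefix_string.format(count)) for k in state_dict_keys):
            break
        count += 1
    return count

fake_keys = [
    "model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn1.to_q.weight",
    "model.diffusion_model.input_blocks.4.1.transformer_blocks.1.attn1.to_q.weight",
]
prefix = "model.diffusion_model.input_blocks.4.1.transformer_blocks."
print(count_blocks(fake_keys, prefix + "{}"))  # 2 -> transformer depth for that block
```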
@@ -65,7 +65,7 @@ def cleanup_additional_models(models):
    for m in models:
        m.cleanup()

def sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False, noise_mask=None, sigmas=None, callback=None, disable_pbar=False):
def sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False, noise_mask=None, sigmas=None, callback=None, disable_pbar=False, seed=None):
    device = comfy.model_management.get_torch_device()

    if noise_mask is not None:
@@ -85,7 +85,7 @@ def sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative

    sampler = comfy.samplers.KSampler(real_model, steps=steps, device=device, sampler=sampler_name, scheduler=scheduler, denoise=denoise, model_options=model.model_options)

    samples = sampler.sample(noise, positive_copy, negative_copy, cfg=cfg, latent_image=latent_image, start_step=start_step, last_step=last_step, force_full_denoise=force_full_denoise, denoise_mask=noise_mask, sigmas=sigmas, callback=callback, disable_pbar=disable_pbar)
    samples = sampler.sample(noise, positive_copy, negative_copy, cfg=cfg, latent_image=latent_image, start_step=start_step, last_step=last_step, force_full_denoise=force_full_denoise, denoise_mask=noise_mask, sigmas=sigmas, callback=callback, disable_pbar=disable_pbar, seed=seed)
    samples = samples.cpu()

    cleanup_additional_models(models)
@@ -13,7 +13,7 @@ def lcm(a, b): #TODO: eventually replace by math.lcm (added in python3.9)

#The main sampling function shared by all the samplers
#Returns predicted noise
def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, cond_concat=None, model_options={}):
def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, cond_concat=None, model_options={}, seed=None):
    def get_area_and_mult(cond, x_in, cond_concat_in, timestep_in):
        area = (x_in.shape[2], x_in.shape[3], 0, 0)
        strength = 1.0
@@ -229,7 +229,7 @@ def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, con
            timestep_ = torch.cat([timestep] * batch_chunks)

            if control is not None:
                c['control'] = control.get_control(input_x, timestep_, c['c_crossattn'], len(cond_or_uncond))
                c['control'] = control.get_control(input_x, timestep_, c, len(cond_or_uncond))

            transformer_options = {}
            if 'transformer_options' in model_options:
@@ -292,8 +292,8 @@ class CFGNoisePredictor(torch.nn.Module):
        super().__init__()
        self.inner_model = model
        self.alphas_cumprod = model.alphas_cumprod
    def apply_model(self, x, timestep, cond, uncond, cond_scale, cond_concat=None, model_options={}):
        out = sampling_function(self.inner_model.apply_model, x, timestep, uncond, cond, cond_scale, cond_concat, model_options=model_options)
    def apply_model(self, x, timestep, cond, uncond, cond_scale, cond_concat=None, model_options={}, seed=None):
        out = sampling_function(self.inner_model.apply_model, x, timestep, uncond, cond, cond_scale, cond_concat, model_options=model_options, seed=seed)
        return out


@@ -301,11 +301,11 @@ class KSamplerX0Inpaint(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.inner_model = model
    def forward(self, x, sigma, uncond, cond, cond_scale, denoise_mask, cond_concat=None, model_options={}):
    def forward(self, x, sigma, uncond, cond, cond_scale, denoise_mask, cond_concat=None, model_options={}, seed=None):
        if denoise_mask is not None:
            latent_mask = 1. - denoise_mask
            x = x * denoise_mask + (self.latent_image + self.noise * sigma.reshape([sigma.shape[0]] + [1] * (len(self.noise.shape) - 1))) * latent_mask
        out = self.inner_model(x, sigma, cond=cond, uncond=uncond, cond_scale=cond_scale, cond_concat=cond_concat, model_options=model_options)
        out = self.inner_model(x, sigma, cond=cond, uncond=uncond, cond_scale=cond_scale, cond_concat=cond_concat, model_options=model_options, seed=seed)
        if denoise_mask is not None:
            out *= denoise_mask

@@ -460,8 +460,7 @@ def apply_empty_x_to_equal_area(conds, uncond, name, uncond_fill_func):
            n[name] = uncond_fill_func(cond_cnets, x)
            uncond[temp[1]] = [o[0], n]


def encode_adm(model, conds, batch_size, device):
def encode_adm(model, conds, batch_size, width, height, device, prompt_type):
    for t in range(len(conds)):
        x = conds[t]
        adm_out = None
@@ -469,7 +468,11 @@ def encode_adm(model, conds, batch_size, device):
            adm_out = x[1]["adm"]
        else:
            params = x[1].copy()
            params["width"] = params.get("width", width * 8)
            params["height"] = params.get("height", height * 8)
            params["prompt_type"] = params.get("prompt_type", prompt_type)
            adm_out = model.encode_adm(device=device, **params)

        if adm_out is not None:
            x[1] = x[1].copy()
            x[1]["adm_encoded"] = torch.cat([adm_out] * batch_size).to(device)
@@ -539,7 +542,7 @@ class KSampler:
            sigmas = self.calculate_sigmas(new_steps).to(self.device)
            self.sigmas = sigmas[-(steps + 1):]

    def sample(self, noise, positive, negative, cfg, latent_image=None, start_step=None, last_step=None, force_full_denoise=False, denoise_mask=None, sigmas=None, callback=None, disable_pbar=False):
    def sample(self, noise, positive, negative, cfg, latent_image=None, start_step=None, last_step=None, force_full_denoise=False, denoise_mask=None, sigmas=None, callback=None, disable_pbar=False, seed=None):
        if sigmas is None:
            sigmas = self.sigmas
        sigma_min = self.sigma_min
@@ -580,10 +583,13 @@ class KSampler:
            precision_scope = contextlib.nullcontext

        if self.model.is_adm():
            positive = encode_adm(self.model, positive, noise.shape[0], self.device)
            negative = encode_adm(self.model, negative, noise.shape[0], self.device)
            positive = encode_adm(self.model, positive, noise.shape[0], noise.shape[3], noise.shape[2], self.device, "positive")
            negative = encode_adm(self.model, negative, noise.shape[0], noise.shape[3], noise.shape[2], self.device, "negative")

        extra_args = {"cond":positive, "uncond":negative, "cond_scale": cfg, "model_options": self.model_options}
        if latent_image is not None:
            latent_image = self.model.process_latent_in(latent_image)

        extra_args = {"cond":positive, "uncond":negative, "cond_scale": cfg, "model_options": self.model_options, "seed":seed}

        cond_concat = None
        if hasattr(self.model, 'concat_keys'): #inpaint
@@ -669,4 +675,4 @@ class KSampler:
        else:
            samples = getattr(k_diffusion_sampling, "sample_{}".format(self.sampler))(self.model_k, noise, sigmas, extra_args=extra_args, callback=k_callback, disable=disable_pbar)

        return samples.to(torch.float32)
        return self.model.process_latent_out(samples.to(torch.float32))
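Two plumbing details in KSampler.sample are easy to miss: the latent's spatial shape is converted back to pixel dimensions (noise.shape[3] * 8 and noise.shape[2] * 8) before being handed to encode_adm, and the seed now rides along in extra_args so the SDE samplers can build a deterministic noise sampler. A tiny hedged sketch of those two conventions with made-up values:

```python
import torch

noise = torch.randn(1, 4, 128, 96)   # (batch, channels, latent_h, latent_w)
width, height = noise.shape[3] * 8, noise.shape[2] * 8
print(width, height)                 # 768 1024: pixel-space size passed to encode_adm

seed = 123456
extra_args = {"cond": [], "uncond": [], "cond_scale": 8.0, "model_options": {}, "seed": seed}
# downstream, e.g. sample_dpmpp_sde does: seed = extra_args.get("seed", None)
print(extra_args.get("seed", None))
```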
comfy/sd.py (476 lines changed)

@@ -3,8 +3,6 @@ import contextlib
import copy
import inspect

from . import sd1_clip
from . import sd2_clip
from comfy import model_management
from .ldm.util import instantiate_from_config
from .ldm.models.autoencoder import AutoencoderKL
@@ -17,19 +15,29 @@ from . import clip_vision
from . import gligen
from . import diffusers_convert
from . import model_base
from . import model_detection

def load_model_weights(model, sd, verbose=False, load_state_dict_to=[]):
    replace_prefix = {"model.diffusion_model.": "diffusion_model."}
    for rp in replace_prefix:
        replace = list(map(lambda a: (a, "{}{}".format(replace_prefix[rp], a[len(rp):])), filter(lambda a: a.startswith(rp), sd.keys())))
        for x in replace:
            sd[x[1]] = sd.pop(x[0])
from . import sd1_clip
from . import sd2_clip
from . import sdxl_clip

def load_model_weights(model, sd):
    m, u = model.load_state_dict(sd, strict=False)
    m = set(m)
    unexpected_keys = set(u)

    k = list(sd.keys())
    for x in k:
        # print(x)
        if x not in unexpected_keys:
            w = sd.pop(x)
            del w
    if len(m) > 0:
        print("missing", m)
    return model

def load_clip_weights(model, sd):
    k = list(sd.keys())
    for x in k:
        if x.startswith("cond_stage_model.transformer.") and not x.startswith("cond_stage_model.transformer.text_model."):
            y = x.replace("cond_stage_model.transformer.", "cond_stage_model.transformer.text_model.")
            sd[y] = sd.pop(x)
@@ -39,20 +47,8 @@ def load_model_weights(model, sd, verbose=False, load_state_dict_to=[]):
    if ids.dtype == torch.float32:
        sd['cond_stage_model.transformer.text_model.embeddings.position_ids'] = ids.round()

    sd = utils.transformers_convert(sd, "cond_stage_model.model", "cond_stage_model.transformer.text_model", 24)

    for x in load_state_dict_to:
        x.load_state_dict(sd, strict=False)

    if len(m) > 0 and verbose:
        print("missing keys:")
        print(m)
    if len(u) > 0 and verbose:
        print("unexpected keys:")
        print(u)

    model.eval()
    return model
    sd = utils.transformers_convert(sd, "cond_stage_model.model.", "cond_stage_model.transformer.text_model.", 24)
    return load_model_weights(model, sd)
LORA_CLIP_MAP = {
    "mlp.fc1": "mlp_fc1",
@@ -66,18 +62,26 @@ LORA_CLIP_MAP = {
LORA_UNET_MAP_ATTENTIONS = {
    "proj_in": "proj_in",
    "proj_out": "proj_out",
    "transformer_blocks.0.attn1.to_q": "transformer_blocks_0_attn1_to_q",
    "transformer_blocks.0.attn1.to_k": "transformer_blocks_0_attn1_to_k",
    "transformer_blocks.0.attn1.to_v": "transformer_blocks_0_attn1_to_v",
    "transformer_blocks.0.attn1.to_out.0": "transformer_blocks_0_attn1_to_out_0",
    "transformer_blocks.0.attn2.to_q": "transformer_blocks_0_attn2_to_q",
    "transformer_blocks.0.attn2.to_k": "transformer_blocks_0_attn2_to_k",
    "transformer_blocks.0.attn2.to_v": "transformer_blocks_0_attn2_to_v",
    "transformer_blocks.0.attn2.to_out.0": "transformer_blocks_0_attn2_to_out_0",
    "transformer_blocks.0.ff.net.0.proj": "transformer_blocks_0_ff_net_0_proj",
    "transformer_blocks.0.ff.net.2": "transformer_blocks_0_ff_net_2",
}

transformer_lora_blocks = {
    "transformer_blocks.{}.attn1.to_q": "transformer_blocks_{}_attn1_to_q",
    "transformer_blocks.{}.attn1.to_k": "transformer_blocks_{}_attn1_to_k",
    "transformer_blocks.{}.attn1.to_v": "transformer_blocks_{}_attn1_to_v",
    "transformer_blocks.{}.attn1.to_out.0": "transformer_blocks_{}_attn1_to_out_0",
    "transformer_blocks.{}.attn2.to_q": "transformer_blocks_{}_attn2_to_q",
    "transformer_blocks.{}.attn2.to_k": "transformer_blocks_{}_attn2_to_k",
    "transformer_blocks.{}.attn2.to_v": "transformer_blocks_{}_attn2_to_v",
    "transformer_blocks.{}.attn2.to_out.0": "transformer_blocks_{}_attn2_to_out_0",
    "transformer_blocks.{}.ff.net.0.proj": "transformer_blocks_{}_ff_net_0_proj",
    "transformer_blocks.{}.ff.net.2": "transformer_blocks_{}_ff_net_2",
}

for i in range(10):
    for k in transformer_lora_blocks:
        LORA_UNET_MAP_ATTENTIONS[k.format(i)] = transformer_lora_blocks[k].format(i)


LORA_UNET_MAP_RESNET = {
    "in_layers.2": "resnets_{}_conv1",
    "emb_layers.1": "resnets_{}_time_emb_proj",
@@ -281,6 +285,11 @@ def model_lora_keys(model, key_map={}):
        if key_in:
            counter += 1

    for k in sdk:
        if k.startswith("diffusion_model.") and k.endswith(".weight"):
            key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_")
            key_map["lora_unet_{}".format(key_lora)] = k

    return key_map
@@ -312,9 +321,6 @@ class ModelPatcher:
        n.model_keys = self.model_keys
        return n

    def set_model_tomesd(self, ratio):
        self.model_options["transformer_options"]["tomesd"] = {"ratio": ratio}

    def set_model_sampler_cfg_function(self, sampler_cfg_function):
        if len(inspect.signature(sampler_cfg_function).parameters) == 3:
            self.model_options["sampler_cfg_function"] = lambda args: sampler_cfg_function(args["cond"], args["uncond"], args["cond_scale"]) #Old way
@@ -327,12 +333,29 @@ class ModelPatcher:
            to["patches"] = {}
        to["patches"][name] = to["patches"].get(name, []) + [patch]

    def set_model_patch_replace(self, patch, name, block_name, number):
        to = self.model_options["transformer_options"]
        if "patches_replace" not in to:
            to["patches_replace"] = {}
        if name not in to["patches_replace"]:
            to["patches_replace"][name] = {}
        to["patches_replace"][name][(block_name, number)] = patch

    def set_model_attn1_patch(self, patch):
        self.set_model_patch(patch, "attn1_patch")

    def set_model_attn2_patch(self, patch):
        self.set_model_patch(patch, "attn2_patch")

    def set_model_attn1_replace(self, patch, block_name, number):
        self.set_model_patch_replace(patch, "attn1", block_name, number)

    def set_model_attn2_replace(self, patch, block_name, number):
        self.set_model_patch_replace(patch, "attn2", block_name, number)

    def set_model_attn1_output_patch(self, patch):
        self.set_model_patch(patch, "attn1_output_patch")

    def set_model_attn2_output_patch(self, patch):
        self.set_model_patch(patch, "attn2_output_patch")

@@ -345,6 +368,13 @@ class ModelPatcher:
            for i in range(len(patch_list)):
                if hasattr(patch_list[i], "to"):
                    patch_list[i] = patch_list[i].to(device)
        if "patches_replace" in to:
            patches = to["patches_replace"]
            for name in patches:
                patch_list = patches[name]
                for k in patch_list:
                    if hasattr(patch_list[k], "to"):
                        patch_list[k] = patch_list[k].to(device)
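set_model_patch_replace stores one callable per (block_name, number) key under transformer_options["patches_replace"][name], which is exactly the lookup BasicTransformerBlock performs in the attention change earlier in this diff. A hedged standalone sketch of the bookkeeping it builds (a free-function re-implementation on a plain dict, not the ModelPatcher method itself; the dummy patch is hypothetical):

```python
# Sketch: the data structure set_model_patch_replace builds inside model_options.
def set_model_patch_replace(model_options, patch, name, block_name, number):
    to = model_options.setdefault("transformer_options", {})
    to.setdefault("patches_replace", {}).setdefault(name, {})[(block_name, number)] = patch
    return model_options

def dummy_patch(q, k, v, extra_options):
    return v  # placeholder replacement attention

opts = set_model_patch_replace({}, dummy_patch, "attn2", "middle", 0)
print(opts["transformer_options"]["patches_replace"]["attn2"].keys())  # dict_keys([('middle', 0)])
```

In ComfyUI itself the equivalent calls would go through a cloned ModelPatcher, e.g. set_model_attn2_replace(patch, "middle", 0) as defined above.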
def model_dtype(self):
|
||||
return self.model.get_dtype()
|
||||
@ -387,7 +417,11 @@ class ModelPatcher:
|
||||
weight *= strength_model
|
||||
|
||||
if len(v) == 1:
|
||||
weight += alpha * (v[0]).type(weight.dtype).to(weight.device)
|
||||
w1 = v[0]
|
||||
if w1.shape != weight.shape:
|
||||
print("WARNING SHAPE MISMATCH {} WEIGHT NOT MERGED {} != {}".format(key, w1.shape, weight.shape))
|
||||
else:
|
||||
weight += alpha * w1.type(weight.dtype).to(weight.device)
|
||||
elif len(v) == 4: #lora/locon
|
||||
mat1 = v[0]
|
||||
mat2 = v[1]
|
||||
@ -470,21 +504,12 @@ def load_lora_for_models(model, clip, lora_path, strength_model, strength_clip):


class CLIP:
    def __init__(self, config={}, embedding_directory=None, no_init=False):
    def __init__(self, target=None, embedding_directory=None, no_init=False):
        if no_init:
            return
        self.target_clip = config["target"]
        if "params" in config:
            params = config["params"]
        else:
            params = {}

        if self.target_clip.endswith("FrozenOpenCLIPEmbedder"):
            clip = sd2_clip.SD2ClipModel
            tokenizer = sd2_clip.SD2Tokenizer
        elif self.target_clip.endswith("FrozenCLIPEmbedder"):
            clip = sd1_clip.SD1ClipModel
            tokenizer = sd1_clip.SD1Tokenizer
        params = target.params
        clip = target.clip
        tokenizer = target.tokenizer

        self.device = model_management.text_encoder_device()
        params["device"] = self.device
@ -497,15 +522,15 @@ class CLIP:

    def clone(self):
        n = CLIP(no_init=True)
        n.target_clip = self.target_clip
        n.patcher = self.patcher.clone()
        n.cond_stage_model = self.cond_stage_model
        n.tokenizer = self.tokenizer
        n.layer_idx = self.layer_idx
        n.device = self.device
        return n

    def load_from_state_dict(self, sd):
        self.cond_stage_model.transformer.load_state_dict(sd, strict=False)
        self.cond_stage_model.load_sd(sd)

    def add_patches(self, patches, strength=1.0):
        return self.patcher.add_patches(patches, strength)
@ -521,23 +546,26 @@ class CLIP:
        self.cond_stage_model.clip_layer(self.layer_idx)
        try:
            self.patcher.patch_model()
            cond = self.cond_stage_model.encode_token_weights(tokens)
            cond, pooled = self.cond_stage_model.encode_token_weights(tokens)
            self.patcher.unpatch_model()
        except Exception as e:
            self.patcher.unpatch_model()
            raise e

        cond_out = cond
        if return_pooled:
            eos_token_index = max(range(len(tokens[0])), key=tokens[0].__getitem__)
            pooled = cond[:, eos_token_index]
            return cond, pooled
        return cond
            return cond_out, pooled
        return cond_out

    def encode(self, text):
        tokens = self.tokenize(text)
        return self.encode_from_tokens(tokens)

    def load_sd(self, sd):
        return self.cond_stage_model.load_sd(sd)

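Since `encode_token_weights` now returns a pooled output alongside the conditioning, callers fetch it through `return_pooled`. A short sketch of the calling pattern, mirroring the `CLIPTextEncode` change in nodes.py further down; `clip` is assumed to be a loaded `comfy.sd.CLIP` instance:

```python
def encode_text(clip, text):
    # clip is assumed to come from a checkpoint or CLIP loader
    tokens = clip.tokenize(text)
    cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)
    # the pooled output travels alongside the conditioning, as in CLIPTextEncode
    return [[cond, {"pooled_output": pooled}]]
```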
class VAE:
|
||||
def __init__(self, ckpt_path=None, scale_factor=0.18215, device=None, config=None):
|
||||
def __init__(self, ckpt_path=None, device=None, config=None):
|
||||
if config is None:
|
||||
#default SD1.x/SD2.x VAE parameters
|
||||
ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
|
||||
@ -551,7 +579,6 @@ class VAE:
|
||||
sd = diffusers_convert.convert_vae_state_dict(sd)
|
||||
self.first_stage_model.load_state_dict(sd, strict=False)
|
||||
|
||||
self.scale_factor = scale_factor
|
||||
if device is None:
|
||||
device = model_management.get_torch_device()
|
||||
self.device = device
|
||||
@ -562,7 +589,7 @@ class VAE:
|
||||
steps += samples.shape[0] * utils.get_tiled_scale_steps(samples.shape[3], samples.shape[2], tile_x * 2, tile_y // 2, overlap)
|
||||
pbar = utils.ProgressBar(steps)
|
||||
|
||||
decode_fn = lambda a: (self.first_stage_model.decode(1. / self.scale_factor * a.to(self.device)) + 1.0)
|
||||
decode_fn = lambda a: (self.first_stage_model.decode(a.to(self.device)) + 1.0)
|
||||
output = torch.clamp((
|
||||
(utils.tiled_scale(samples, decode_fn, tile_x // 2, tile_y * 2, overlap, upscale_amount = 8, pbar = pbar) +
|
||||
utils.tiled_scale(samples, decode_fn, tile_x * 2, tile_y // 2, overlap, upscale_amount = 8, pbar = pbar) +
|
||||
@ -576,7 +603,7 @@ class VAE:
|
||||
steps += pixel_samples.shape[0] * utils.get_tiled_scale_steps(pixel_samples.shape[3], pixel_samples.shape[2], tile_x * 2, tile_y // 2, overlap)
|
||||
pbar = utils.ProgressBar(steps)
|
||||
|
||||
encode_fn = lambda a: self.first_stage_model.encode(2. * a.to(self.device) - 1.).sample() * self.scale_factor
|
||||
encode_fn = lambda a: self.first_stage_model.encode(2. * a.to(self.device) - 1.).sample()
|
||||
samples = utils.tiled_scale(pixel_samples, encode_fn, tile_x, tile_y, overlap, upscale_amount = (1/8), out_channels=4, pbar=pbar)
|
||||
samples += utils.tiled_scale(pixel_samples, encode_fn, tile_x * 2, tile_y // 2, overlap, upscale_amount = (1/8), out_channels=4, pbar=pbar)
|
||||
samples += utils.tiled_scale(pixel_samples, encode_fn, tile_x // 2, tile_y * 2, overlap, upscale_amount = (1/8), out_channels=4, pbar=pbar)
|
||||
@ -594,7 +621,7 @@ class VAE:
|
||||
pixel_samples = torch.empty((samples_in.shape[0], 3, round(samples_in.shape[2] * 8), round(samples_in.shape[3] * 8)), device="cpu")
|
||||
for x in range(0, samples_in.shape[0], batch_number):
|
||||
samples = samples_in[x:x+batch_number].to(self.device)
|
||||
pixel_samples[x:x+batch_number] = torch.clamp((self.first_stage_model.decode(1. / self.scale_factor * samples) + 1.0) / 2.0, min=0.0, max=1.0).cpu()
|
||||
pixel_samples[x:x+batch_number] = torch.clamp((self.first_stage_model.decode(samples) + 1.0) / 2.0, min=0.0, max=1.0).cpu()
|
||||
except model_management.OOM_EXCEPTION as e:
|
||||
print("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
|
||||
pixel_samples = self.decode_tiled_(samples_in)
|
||||
@ -621,7 +648,7 @@ class VAE:
|
||||
samples = torch.empty((pixel_samples.shape[0], 4, round(pixel_samples.shape[2] // 8), round(pixel_samples.shape[3] // 8)), device="cpu")
|
||||
for x in range(0, pixel_samples.shape[0], batch_number):
|
||||
pixels_in = (2. * pixel_samples[x:x+batch_number] - 1.).to(self.device)
|
||||
samples[x:x+batch_number] = self.first_stage_model.encode(pixels_in).sample().cpu() * self.scale_factor
|
||||
samples[x:x+batch_number] = self.first_stage_model.encode(pixels_in).sample().cpu()
|
||||
|
||||
except model_management.OOM_EXCEPTION as e:
|
||||
print("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.")
|
||||
@ -668,10 +695,10 @@ class ControlNet:
|
||||
self.previous_controlnet = None
|
||||
self.global_average_pooling = global_average_pooling
|
||||
|
||||
def get_control(self, x_noisy, t, cond_txt, batched_number):
|
||||
def get_control(self, x_noisy, t, cond, batched_number):
|
||||
control_prev = None
|
||||
if self.previous_controlnet is not None:
|
||||
control_prev = self.previous_controlnet.get_control(x_noisy, t, cond_txt, batched_number)
|
||||
control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number)
|
||||
|
||||
output_dtype = x_noisy.dtype
|
||||
if self.cond_hint is None or x_noisy.shape[2] * 8 != self.cond_hint.shape[2] or x_noisy.shape[3] * 8 != self.cond_hint.shape[3]:
|
||||
@ -689,7 +716,9 @@ class ControlNet:
|
||||
|
||||
with precision_scope(model_management.get_autocast_device(self.device)):
|
||||
self.control_model = model_management.load_if_low_vram(self.control_model)
|
||||
control = self.control_model(x=x_noisy, hint=self.cond_hint, timesteps=t, context=cond_txt)
|
||||
context = torch.cat(cond['c_crossattn'], 1)
|
||||
y = cond.get('c_adm', None)
|
||||
control = self.control_model(x=x_noisy, hint=self.cond_hint, timesteps=t, context=context, y=y)
|
||||
self.control_model = model_management.unload_if_low_vram(self.control_model)
|
||||
out = {'middle':[], 'output': []}
|
||||
autocast_enabled = torch.is_autocast_enabled()
|
||||
@ -749,60 +778,28 @@ class ControlNet:
|
||||
|
||||
def load_controlnet(ckpt_path, model=None):
|
||||
controlnet_data = utils.load_torch_file(ckpt_path, safe_load=True)
|
||||
pth_key = 'control_model.input_blocks.1.1.transformer_blocks.0.attn2.to_k.weight'
|
||||
pth_key = 'control_model.zero_convs.0.0.weight'
|
||||
pth = False
|
||||
sd2 = False
|
||||
key = 'input_blocks.1.1.transformer_blocks.0.attn2.to_k.weight'
|
||||
key = 'zero_convs.0.0.weight'
|
||||
if pth_key in controlnet_data:
|
||||
pth = True
|
||||
key = pth_key
|
||||
prefix = "control_model."
|
||||
elif key in controlnet_data:
|
||||
pass
|
||||
prefix = ""
|
||||
else:
|
||||
net = load_t2i_adapter(controlnet_data)
|
||||
if net is None:
|
||||
print("error checkpoint does not contain controlnet or t2i adapter data", ckpt_path)
|
||||
return net
|
||||
|
||||
context_dim = controlnet_data[key].shape[1]
|
||||
use_fp16 = model_management.should_use_fp16()
|
||||
|
||||
use_fp16 = False
|
||||
if model_management.should_use_fp16() and controlnet_data[key].dtype == torch.float16:
|
||||
use_fp16 = True
|
||||
controlnet_config = model_detection.model_config_from_unet(controlnet_data, prefix, use_fp16).unet_config
|
||||
controlnet_config.pop("out_channels")
|
||||
controlnet_config["hint_channels"] = 3
|
||||
control_model = cldm.ControlNet(**controlnet_config)
|
||||
|
||||
if context_dim == 768:
|
||||
#SD1.x
|
||||
control_model = cldm.ControlNet(image_size=32,
|
||||
in_channels=4,
|
||||
hint_channels=3,
|
||||
model_channels=320,
|
||||
attention_resolutions=[ 4, 2, 1 ],
|
||||
num_res_blocks=2,
|
||||
channel_mult=[ 1, 2, 4, 4 ],
|
||||
num_heads=8,
|
||||
use_spatial_transformer=True,
|
||||
transformer_depth=1,
|
||||
context_dim=context_dim,
|
||||
use_checkpoint=False,
|
||||
legacy=False,
|
||||
use_fp16=use_fp16)
|
||||
else:
|
||||
#SD2.x
|
||||
control_model = cldm.ControlNet(image_size=32,
|
||||
in_channels=4,
|
||||
hint_channels=3,
|
||||
model_channels=320,
|
||||
attention_resolutions=[ 4, 2, 1 ],
|
||||
num_res_blocks=2,
|
||||
channel_mult=[ 1, 2, 4, 4 ],
|
||||
num_head_channels=64,
|
||||
use_spatial_transformer=True,
|
||||
use_linear_in_transformer=True,
|
||||
transformer_depth=1,
|
||||
context_dim=context_dim,
|
||||
use_checkpoint=False,
|
||||
legacy=False,
|
||||
use_fp16=use_fp16)
|
||||
if pth:
|
||||
if 'difference' in controlnet_data:
|
||||
if model is not None:
|
||||
@ -823,9 +820,10 @@ def load_controlnet(ckpt_path, model=None):
|
||||
pass
|
||||
w = WeightsLoader()
|
||||
w.control_model = control_model
|
||||
w.load_state_dict(controlnet_data, strict=False)
|
||||
missing, unexpected = w.load_state_dict(controlnet_data, strict=False)
|
||||
else:
|
||||
control_model.load_state_dict(controlnet_data, strict=False)
|
||||
missing, unexpected = control_model.load_state_dict(controlnet_data, strict=False)
|
||||
print(missing, unexpected)
|
||||
|
||||
if use_fp16:
|
||||
control_model = control_model.half()
|
||||
@ -850,10 +848,10 @@ class T2IAdapter:
|
||||
self.cond_hint_original = None
|
||||
self.cond_hint = None
|
||||
|
||||
def get_control(self, x_noisy, t, cond_txt, batched_number):
|
||||
def get_control(self, x_noisy, t, cond, batched_number):
|
||||
control_prev = None
|
||||
if self.previous_controlnet is not None:
|
||||
control_prev = self.previous_controlnet.get_control(x_noisy, t, cond_txt, batched_number)
|
||||
control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number)
|
||||
|
||||
if self.cond_hint is None or x_noisy.shape[2] * 8 != self.cond_hint.shape[2] or x_noisy.shape[3] * 8 != self.cond_hint.shape[3]:
|
||||
if self.cond_hint is not None:
|
||||
@ -929,12 +927,21 @@ class T2IAdapter:
|
||||
|
||||
def load_t2i_adapter(t2i_data):
|
||||
keys = t2i_data.keys()
|
||||
if 'adapter' in keys:
|
||||
t2i_data = t2i_data['adapter']
|
||||
keys = t2i_data.keys()
|
||||
if "body.0.in_conv.weight" in keys:
|
||||
cin = t2i_data['body.0.in_conv.weight'].shape[1]
|
||||
model_ad = adapter.Adapter_light(cin=cin, channels=[320, 640, 1280, 1280], nums_rb=4)
|
||||
elif 'conv_in.weight' in keys:
|
||||
cin = t2i_data['conv_in.weight'].shape[1]
|
||||
model_ad = adapter.Adapter(cin=cin, channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True, use_conv=False)
|
||||
channel = t2i_data['conv_in.weight'].shape[0]
|
||||
ksize = t2i_data['body.0.block2.weight'].shape[2]
|
||||
use_conv = False
|
||||
down_opts = list(filter(lambda a: a.endswith("down_opt.op.weight"), keys))
|
||||
if len(down_opts) > 0:
|
||||
use_conv = True
|
||||
model_ad = adapter.Adapter(cin=cin, channels=[channel, channel*2, channel*4, channel*4][:4], nums_rb=2, ksize=ksize, sk=True, use_conv=use_conv)
|
||||
else:
|
||||
return None
|
||||
model_ad.load_state_dict(t2i_data)
|
||||
@ -960,15 +967,42 @@ def load_style_model(ckpt_path):
    return StyleModel(model)


def load_clip(ckpt_path, embedding_directory=None):
    clip_data = utils.load_torch_file(ckpt_path, safe_load=True)
    config = {}
    if "text_model.encoder.layers.22.mlp.fc1.weight" in clip_data:
        config['target'] = 'comfy.ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder'
def load_clip(ckpt_paths, embedding_directory=None):
    clip_data = []
    for p in ckpt_paths:
        clip_data.append(utils.load_torch_file(p, safe_load=True))

    class EmptyClass:
        pass

    for i in range(len(clip_data)):
        if "transformer.resblocks.0.ln_1.weight" in clip_data[i]:
            clip_data[i] = utils.transformers_convert(clip_data[i], "", "text_model.", 32)

    clip_target = EmptyClass()
    clip_target.params = {}
    if len(clip_data) == 1:
        if "text_model.encoder.layers.30.mlp.fc1.weight" in clip_data[0]:
            clip_target.clip = sdxl_clip.SDXLRefinerClipModel
            clip_target.tokenizer = sdxl_clip.SDXLTokenizer
        elif "text_model.encoder.layers.22.mlp.fc1.weight" in clip_data[0]:
            clip_target.clip = sd2_clip.SD2ClipModel
            clip_target.tokenizer = sd2_clip.SD2Tokenizer
        else:
            clip_target.clip = sd1_clip.SD1ClipModel
            clip_target.tokenizer = sd1_clip.SD1Tokenizer
    else:
        config['target'] = 'comfy.ldm.modules.encoders.modules.FrozenCLIPEmbedder'
    clip = CLIP(config=config, embedding_directory=embedding_directory)
    clip.load_from_state_dict(clip_data)
        clip_target.clip = sdxl_clip.SDXLClipModel
        clip_target.tokenizer = sdxl_clip.SDXLTokenizer

    clip = CLIP(clip_target, embedding_directory=embedding_directory)
    for c in clip_data:
        m, u = clip.load_sd(c)
        if len(m) > 0:
            print("clip missing:", m)

        if len(u) > 0:
            print("clip unexpected:", u)
    return clip
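A short usage sketch of the reworked loader, matching how the `CLIPLoader` and `DualCLIPLoader` nodes below call it; the file paths and embedding directory are placeholders, not files shipped with the repo:

```python
import comfy.sd

# single text encoder (SD1.x / SD2.x / SDXL refiner style)
clip = comfy.sd.load_clip(ckpt_paths=["models/clip/clip_l.safetensors"],
                          embedding_directory="models/embeddings")

# both SDXL text encoders at once; the pair is detected as SDXLClipModel
sdxl_clip_obj = comfy.sd.load_clip(ckpt_paths=["models/clip/clip_l.safetensors",
                                               "models/clip/clip_g.safetensors"],
                                   embedding_directory="models/embeddings")
```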
def load_gligen(ckpt_path):
|
||||
@ -979,6 +1013,7 @@ def load_gligen(ckpt_path):
|
||||
return model
|
||||
|
||||
def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_clip=True, embedding_directory=None, state_dict=None, config=None):
|
||||
#TODO: this function is a mess and should be removed eventually
|
||||
if config is None:
|
||||
with open(config_path, 'r') as stream:
|
||||
config = yaml.safe_load(stream)
|
||||
@ -1010,32 +1045,49 @@ def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_cl
|
||||
class WeightsLoader(torch.nn.Module):
|
||||
pass
|
||||
|
||||
w = WeightsLoader()
|
||||
load_state_dict_to = []
|
||||
if output_vae:
|
||||
vae = VAE(scale_factor=scale_factor, config=vae_config)
|
||||
w.first_stage_model = vae.first_stage_model
|
||||
load_state_dict_to = [w]
|
||||
|
||||
if output_clip:
|
||||
clip = CLIP(config=clip_config, embedding_directory=embedding_directory)
|
||||
w.cond_stage_model = clip.cond_stage_model
|
||||
load_state_dict_to = [w]
|
||||
|
||||
if config['model']["target"].endswith("LatentInpaintDiffusion"):
|
||||
model = model_base.SDInpaint(unet_config, v_prediction=v_prediction)
|
||||
elif config['model']["target"].endswith("ImageEmbeddingConditionedLatentDiffusion"):
|
||||
model = model_base.SD21UNCLIP(unet_config, noise_aug_config["params"], v_prediction=v_prediction)
|
||||
else:
|
||||
model = model_base.BaseModel(unet_config, v_prediction=v_prediction)
|
||||
|
||||
if state_dict is None:
|
||||
state_dict = utils.load_torch_file(ckpt_path)
|
||||
model = load_model_weights(model, state_dict, verbose=False, load_state_dict_to=load_state_dict_to)
|
||||
|
||||
class EmptyClass:
|
||||
pass
|
||||
|
||||
model_config = EmptyClass()
|
||||
model_config.unet_config = unet_config
|
||||
from . import latent_formats
|
||||
model_config.latent_format = latent_formats.SD15(scale_factor=scale_factor)
|
||||
|
||||
if config['model']["target"].endswith("LatentInpaintDiffusion"):
|
||||
model = model_base.SDInpaint(model_config, v_prediction=v_prediction)
|
||||
elif config['model']["target"].endswith("ImageEmbeddingConditionedLatentDiffusion"):
|
||||
model = model_base.SD21UNCLIP(model_config, noise_aug_config["params"], v_prediction=v_prediction)
|
||||
else:
|
||||
model = model_base.BaseModel(model_config, v_prediction=v_prediction)
|
||||
|
||||
if fp16:
|
||||
model = model.half()
|
||||
|
||||
model.load_model_weights(state_dict, "model.diffusion_model.")
|
||||
|
||||
if output_vae:
|
||||
w = WeightsLoader()
|
||||
vae = VAE(config=vae_config)
|
||||
w.first_stage_model = vae.first_stage_model
|
||||
load_model_weights(w, state_dict)
|
||||
|
||||
if output_clip:
|
||||
w = WeightsLoader()
|
||||
clip_target = EmptyClass()
|
||||
clip_target.params = clip_config.get("params", {})
|
||||
if clip_config["target"].endswith("FrozenOpenCLIPEmbedder"):
|
||||
clip_target.clip = sd2_clip.SD2ClipModel
|
||||
clip_target.tokenizer = sd2_clip.SD2Tokenizer
|
||||
elif clip_config["target"].endswith("FrozenCLIPEmbedder"):
|
||||
clip_target.clip = sd1_clip.SD1ClipModel
|
||||
clip_target.tokenizer = sd1_clip.SD1Tokenizer
|
||||
clip = CLIP(clip_target, embedding_directory=embedding_directory)
|
||||
w.cond_stage_model = clip.cond_stage_model
|
||||
load_clip_weights(w, state_dict)
|
||||
|
||||
return (ModelPatcher(model), clip, vae)
|
||||
|
||||
|
||||
@ -1045,139 +1097,41 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
|
||||
clip = None
|
||||
clipvision = None
|
||||
vae = None
|
||||
model = None
|
||||
clip_target = None
|
||||
|
||||
fp16 = model_management.should_use_fp16()
|
||||
|
||||
class WeightsLoader(torch.nn.Module):
|
||||
pass
|
||||
|
||||
w = WeightsLoader()
|
||||
load_state_dict_to = []
|
||||
model_config = model_detection.model_config_from_unet(sd, "model.diffusion_model.", fp16)
|
||||
if model_config is None:
|
||||
raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path))
|
||||
|
||||
if model_config.clip_vision_prefix is not None:
|
||||
if output_clipvision:
|
||||
clipvision = clip_vision.load_clipvision_from_sd(sd, model_config.clip_vision_prefix, True)
|
||||
|
||||
model = model_config.get_model(sd)
|
||||
model.load_model_weights(sd, "model.diffusion_model.")
|
||||
|
||||
if output_vae:
|
||||
vae = VAE()
|
||||
w = WeightsLoader()
|
||||
w.first_stage_model = vae.first_stage_model
|
||||
load_state_dict_to = [w]
|
||||
load_model_weights(w, sd)
|
||||
|
||||
if output_clip:
|
||||
clip_config = {}
|
||||
if "cond_stage_model.model.transformer.resblocks.22.attn.out_proj.weight" in sd_keys:
|
||||
clip_config['target'] = 'comfy.ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder'
|
||||
else:
|
||||
clip_config['target'] = 'comfy.ldm.modules.encoders.modules.FrozenCLIPEmbedder'
|
||||
clip = CLIP(config=clip_config, embedding_directory=embedding_directory)
|
||||
w = WeightsLoader()
|
||||
clip_target = model_config.clip_target()
|
||||
clip = CLIP(clip_target, embedding_directory=embedding_directory)
|
||||
w.cond_stage_model = clip.cond_stage_model
|
||||
load_state_dict_to = [w]
|
||||
sd = model_config.process_clip_state_dict(sd)
|
||||
load_model_weights(w, sd)
|
||||
|
||||
clipvision_key = "embedder.model.visual.transformer.resblocks.0.attn.in_proj_weight"
|
||||
noise_aug_config = None
|
||||
if clipvision_key in sd_keys:
|
||||
size = sd[clipvision_key].shape[1]
|
||||
|
||||
if output_clipvision:
|
||||
clipvision = clip_vision.load_clipvision_from_sd(sd)
|
||||
|
||||
noise_aug_key = "noise_augmentor.betas"
|
||||
if noise_aug_key in sd_keys:
|
||||
noise_aug_config = {}
|
||||
params = {}
|
||||
noise_schedule_config = {}
|
||||
noise_schedule_config["timesteps"] = sd[noise_aug_key].shape[0]
|
||||
noise_schedule_config["beta_schedule"] = "squaredcos_cap_v2"
|
||||
params["noise_schedule_config"] = noise_schedule_config
|
||||
noise_aug_config['target'] = "comfy.ldm.modules.encoders.noise_aug_modules.CLIPEmbeddingNoiseAugmentation"
|
||||
if size == 1280: #h
|
||||
params["timestep_dim"] = 1024
|
||||
elif size == 1024: #l
|
||||
params["timestep_dim"] = 768
|
||||
noise_aug_config['params'] = params
|
||||
|
||||
sd_config = {
|
||||
"linear_start": 0.00085,
|
||||
"linear_end": 0.012,
|
||||
"num_timesteps_cond": 1,
|
||||
"log_every_t": 200,
|
||||
"timesteps": 1000,
|
||||
"first_stage_key": "jpg",
|
||||
"cond_stage_key": "txt",
|
||||
"image_size": 64,
|
||||
"channels": 4,
|
||||
"cond_stage_trainable": False,
|
||||
"monitor": "val/loss_simple_ema",
|
||||
"scale_factor": 0.18215,
|
||||
"use_ema": False,
|
||||
}
|
||||
|
||||
unet_config = {
|
||||
"use_checkpoint": False,
|
||||
"image_size": 32,
|
||||
"out_channels": 4,
|
||||
"attention_resolutions": [
|
||||
4,
|
||||
2,
|
||||
1
|
||||
],
|
||||
"num_res_blocks": 2,
|
||||
"channel_mult": [
|
||||
1,
|
||||
2,
|
||||
4,
|
||||
4
|
||||
],
|
||||
"use_spatial_transformer": True,
|
||||
"transformer_depth": 1,
|
||||
"legacy": False
|
||||
}
|
||||
|
||||
if len(sd['model.diffusion_model.input_blocks.4.1.proj_in.weight'].shape) == 2:
|
||||
unet_config['use_linear_in_transformer'] = True
|
||||
|
||||
unet_config["use_fp16"] = fp16
|
||||
unet_config["model_channels"] = sd['model.diffusion_model.input_blocks.0.0.weight'].shape[0]
|
||||
unet_config["in_channels"] = sd['model.diffusion_model.input_blocks.0.0.weight'].shape[1]
|
||||
unet_config["context_dim"] = sd['model.diffusion_model.input_blocks.4.1.transformer_blocks.0.attn2.to_k.weight'].shape[1]
|
||||
|
||||
sd_config["unet_config"] = {"target": "comfy.ldm.modules.diffusionmodules.openaimodel.UNetModel", "params": unet_config}
|
||||
|
||||
unclip_model = False
|
||||
inpaint_model = False
|
||||
if noise_aug_config is not None: #SD2.x unclip model
|
||||
sd_config["noise_aug_config"] = noise_aug_config
|
||||
sd_config["image_size"] = 96
|
||||
sd_config["embedding_dropout"] = 0.25
|
||||
sd_config["conditioning_key"] = 'crossattn-adm'
|
||||
unclip_model = True
|
||||
elif unet_config["in_channels"] > 4: #inpainting model
|
||||
sd_config["conditioning_key"] = "hybrid"
|
||||
sd_config["finetune_keys"] = None
|
||||
inpaint_model = True
|
||||
else:
|
||||
sd_config["conditioning_key"] = "crossattn"
|
||||
|
||||
if unet_config["context_dim"] == 768:
|
||||
unet_config["num_heads"] = 8 #SD1.x
|
||||
else:
|
||||
unet_config["num_head_channels"] = 64 #SD2.x
|
||||
|
||||
unclip = 'model.diffusion_model.label_emb.0.0.weight'
|
||||
if unclip in sd_keys:
|
||||
unet_config["num_classes"] = "sequential"
|
||||
unet_config["adm_in_channels"] = sd[unclip].shape[1]
|
||||
|
||||
v_prediction = False
|
||||
if unet_config["context_dim"] == 1024 and unet_config["in_channels"] == 4: #only SD2.x non inpainting models are v prediction
|
||||
k = "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm1.bias"
|
||||
out = sd[k]
|
||||
if torch.std(out, unbiased=False) > 0.09: # not sure how well this will actually work. I guess we will find out.
|
||||
v_prediction = True
|
||||
sd_config["parameterization"] = 'v'
|
||||
|
||||
if inpaint_model:
|
||||
model = model_base.SDInpaint(unet_config, v_prediction=v_prediction)
|
||||
elif unclip_model:
|
||||
model = model_base.SD21UNCLIP(unet_config, noise_aug_config["params"], v_prediction=v_prediction)
|
||||
else:
|
||||
model = model_base.BaseModel(unet_config, v_prediction=v_prediction)
|
||||
|
||||
model = load_model_weights(model, sd, verbose=False, load_state_dict_to=load_state_dict_to)
|
||||
left_over = sd.keys()
|
||||
if len(left_over) > 0:
|
||||
print("left over keys:", left_over)
|
||||
|
||||
return (ModelPatcher(model), clip, vae, clipvision)
|
||||
|
||||
@ -8,11 +8,14 @@ import zipfile
|
||||
|
||||
class ClipTokenWeightEncoder:
|
||||
def encode_token_weights(self, token_weight_pairs):
|
||||
z_empty = self.encode(self.empty_tokens)
|
||||
z_empty, _ = self.encode(self.empty_tokens)
|
||||
output = []
|
||||
first_pooled = None
|
||||
for x in token_weight_pairs:
|
||||
tokens = [list(map(lambda a: a[0], x))]
|
||||
z = self.encode(tokens)
|
||||
z, pooled = self.encode(tokens)
|
||||
if first_pooled is None:
|
||||
first_pooled = pooled
|
||||
for i in range(len(z)):
|
||||
for j in range(len(z[i])):
|
||||
weight = x[j][1]
|
||||
@ -20,7 +23,7 @@ class ClipTokenWeightEncoder:
|
||||
output += [z]
|
||||
if (len(output) == 0):
|
||||
return self.encode(self.empty_tokens)
|
||||
return torch.cat(output, dim=-2).cpu()
|
||||
return torch.cat(output, dim=-2).cpu(), first_pooled.cpu()
|
||||
|
||||
class SD1ClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
"""Uses the CLIP transformer encoder for text (from huggingface)"""
|
||||
@ -50,6 +53,8 @@ class SD1ClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
self.layer = layer
|
||||
self.layer_idx = None
|
||||
self.empty_tokens = [[49406] + [49407] * 76]
|
||||
self.text_projection = None
|
||||
self.layer_norm_hidden_state = True
|
||||
if layer == "hidden":
|
||||
assert layer_idx is not None
|
||||
assert abs(layer_idx) <= 12
|
||||
@ -112,13 +117,20 @@ class SD1ClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
z = outputs.pooler_output[:, None, :]
|
||||
else:
|
||||
z = outputs.hidden_states[self.layer_idx]
|
||||
z = self.transformer.text_model.final_layer_norm(z)
|
||||
if self.layer_norm_hidden_state:
|
||||
z = self.transformer.text_model.final_layer_norm(z)
|
||||
|
||||
return z
|
||||
pooled_output = outputs.pooler_output
|
||||
if self.text_projection is not None:
|
||||
pooled_output = pooled_output @ self.text_projection
|
||||
return z, pooled_output
|
||||
|
||||
def encode(self, tokens):
|
||||
return self(tokens)
|
||||
|
||||
def load_sd(self, sd):
|
||||
return self.transformer.load_state_dict(sd, strict=False)
|
||||
|
||||
def parse_parentheses(string):
|
||||
result = []
|
||||
current_item = ""
|
||||
@ -204,7 +216,7 @@ def expand_directory_list(directories):
|
||||
dirs.add(root)
|
||||
return list(dirs)
|
||||
|
||||
def load_embed(embedding_name, embedding_directory):
|
||||
def load_embed(embedding_name, embedding_directory, embedding_size):
|
||||
if isinstance(embedding_directory, str):
|
||||
embedding_directory = [embedding_directory]
|
||||
|
||||
@ -253,13 +265,23 @@ def load_embed(embedding_name, embedding_directory):
|
||||
if embed_out is None:
|
||||
if 'string_to_param' in embed:
|
||||
values = embed['string_to_param'].values()
|
||||
embed_out = next(iter(values))
|
||||
elif isinstance(embed, list):
|
||||
out_list = []
|
||||
for x in range(len(embed)):
|
||||
for k in embed[x]:
|
||||
t = embed[x][k]
|
||||
if t.shape[-1] != embedding_size:
|
||||
continue
|
||||
out_list.append(t.reshape(-1, t.shape[-1]))
|
||||
embed_out = torch.cat(out_list, dim=0)
|
||||
else:
|
||||
values = embed.values()
|
||||
embed_out = next(iter(values))
|
||||
embed_out = next(iter(values))
|
||||
return embed_out
|
||||
|
||||
class SD1Tokenizer:
|
||||
def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None):
|
||||
def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768):
|
||||
if tokenizer_path is None:
|
||||
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer")
|
||||
self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer_path)
|
||||
@ -275,17 +297,18 @@ class SD1Tokenizer:
|
||||
self.embedding_directory = embedding_directory
|
||||
self.max_word_length = 8
|
||||
self.embedding_identifier = "embedding:"
|
||||
self.embedding_size = embedding_size
|
||||
|
||||
def _try_get_embedding(self, embedding_name:str):
|
||||
'''
|
||||
Takes a potential embedding name and tries to retrieve it.
|
||||
Returns a Tuple consisting of the embedding and any leftover string, embedding can be None.
|
||||
'''
|
||||
embed = load_embed(embedding_name, self.embedding_directory)
|
||||
embed = load_embed(embedding_name, self.embedding_directory, self.embedding_size)
|
||||
if embed is None:
|
||||
stripped = embedding_name.strip(',')
|
||||
if len(stripped) < len(embedding_name):
|
||||
embed = load_embed(stripped, self.embedding_directory)
|
||||
embed = load_embed(stripped, self.embedding_directory, self.embedding_size)
|
||||
return (embed, embedding_name[len(stripped):])
|
||||
return (embed, "")
|
||||
|
||||
|
||||
@ -31,4 +31,4 @@ class SD2ClipModel(sd1_clip.SD1ClipModel):

class SD2Tokenizer(sd1_clip.SD1Tokenizer):
    def __init__(self, tokenizer_path=None, embedding_directory=None):
        super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory)
        super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1024)

96
comfy/sdxl_clip.py
Normal file
@ -0,0 +1,96 @@
|
||||
from comfy import sd1_clip
|
||||
import torch
|
||||
import os
|
||||
|
||||
class SDXLClipG(sd1_clip.SD1ClipModel):
|
||||
def __init__(self, device="cpu", max_length=77, freeze=True, layer="penultimate", layer_idx=None):
|
||||
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json")
|
||||
super().__init__(device=device, freeze=freeze, textmodel_json_config=textmodel_json_config)
|
||||
self.empty_tokens = [[49406] + [49407] + [0] * 75]
|
||||
self.text_projection = torch.nn.Parameter(torch.empty(1280, 1280))
|
||||
self.layer_norm_hidden_state = False
|
||||
if layer == "last":
|
||||
pass
|
||||
elif layer == "penultimate":
|
||||
layer_idx = -1
|
||||
self.clip_layer(layer_idx)
|
||||
elif self.layer == "hidden":
|
||||
assert layer_idx is not None
|
||||
assert abs(layer_idx) < 32
|
||||
self.clip_layer(layer_idx)
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
def clip_layer(self, layer_idx):
|
||||
if layer_idx < 0:
|
||||
layer_idx -= 1 #The real last layer of SD2.x clip is the penultimate one. The last one might contain garbage.
|
||||
if abs(layer_idx) >= 32:
|
||||
self.layer = "hidden"
|
||||
self.layer_idx = -2
|
||||
else:
|
||||
self.layer = "hidden"
|
||||
self.layer_idx = layer_idx
|
||||
|
||||
def load_sd(self, sd):
|
||||
if "text_projection" in sd:
|
||||
self.text_projection[:] = sd.pop("text_projection")
|
||||
return super().load_sd(sd)
|
||||
|
||||
class SDXLClipGTokenizer(sd1_clip.SD1Tokenizer):
|
||||
def __init__(self, tokenizer_path=None, embedding_directory=None):
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1280)
|
||||
|
||||
|
||||
class SDXLTokenizer(sd1_clip.SD1Tokenizer):
|
||||
def __init__(self, embedding_directory=None):
|
||||
self.clip_l = sd1_clip.SD1Tokenizer(embedding_directory=embedding_directory)
|
||||
self.clip_g = SDXLClipGTokenizer(embedding_directory=embedding_directory)
|
||||
|
||||
def tokenize_with_weights(self, text:str, return_word_ids=False):
|
||||
out = {}
|
||||
out["g"] = self.clip_g.tokenize_with_weights(text, return_word_ids)
|
||||
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
|
||||
return out
|
||||
|
||||
def untokenize(self, token_weight_pair):
|
||||
return self.clip_g.untokenize(token_weight_pair)
|
||||
|
||||
class SDXLClipModel(torch.nn.Module):
|
||||
def __init__(self, device="cpu"):
|
||||
super().__init__()
|
||||
self.clip_l = sd1_clip.SD1ClipModel(layer="hidden", layer_idx=11, device=device)
|
||||
self.clip_l.layer_norm_hidden_state = False
|
||||
self.clip_g = SDXLClipG(device=device)
|
||||
|
||||
def clip_layer(self, layer_idx):
|
||||
self.clip_l.clip_layer(layer_idx)
|
||||
self.clip_g.clip_layer(layer_idx)
|
||||
|
||||
def encode_token_weights(self, token_weight_pairs):
|
||||
token_weight_pairs_g = token_weight_pairs["g"]
|
||||
token_weight_pairs_l = token_weight_pairs["l"]
|
||||
g_out, g_pooled = self.clip_g.encode_token_weights(token_weight_pairs_g)
|
||||
l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
|
||||
return torch.cat([l_out, g_out], dim=-1), g_pooled
|
||||
|
||||
def load_sd(self, sd):
|
||||
if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
|
||||
return self.clip_g.load_sd(sd)
|
||||
else:
|
||||
return self.clip_l.load_sd(sd)
|
||||
|
||||
class SDXLRefinerClipModel(torch.nn.Module):
|
||||
def __init__(self, device="cpu"):
|
||||
super().__init__()
|
||||
self.clip_g = SDXLClipG(device=device)
|
||||
|
||||
def clip_layer(self, layer_idx):
|
||||
self.clip_g.clip_layer(layer_idx)
|
||||
|
||||
def encode_token_weights(self, token_weight_pairs):
|
||||
token_weight_pairs_g = token_weight_pairs["g"]
|
||||
g_out, g_pooled = self.clip_g.encode_token_weights(token_weight_pairs_g)
|
||||
return g_out, g_pooled
|
||||
|
||||
def load_sd(self, sd):
|
||||
return self.clip_g.load_sd(sd)
|
||||
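The two tokenizers and encoders defined above are always used as a pair for SDXL. A hedged sketch of the flow; the hidden sizes (768 for clip_l, 1280 for clip_g, giving a 2048-channel context that matches the SDXL unet config added below) are assumptions, and `clip_model` / `tokenizer` are assumed to be `SDXLClipModel` / `SDXLTokenizer` instances:

```python
def sdxl_encode(clip_model, tokenizer, text):
    tokens = tokenizer.tokenize_with_weights(text)   # {"g": [...], "l": [...]}
    cond, pooled = clip_model.encode_token_weights(tokens)
    # cond is torch.cat([l_out, g_out], dim=-1); pooled comes from clip_g
    return cond, pooled
```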
149
comfy/supported_models.py
Normal file
@ -0,0 +1,149 @@
|
||||
import torch
|
||||
from . import model_base
|
||||
from . import utils
|
||||
|
||||
from . import sd1_clip
|
||||
from . import sd2_clip
|
||||
from . import sdxl_clip
|
||||
|
||||
from . import supported_models_base
|
||||
from . import latent_formats
|
||||
|
||||
class SD15(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"context_dim": 768,
|
||||
"model_channels": 320,
|
||||
"use_linear_in_transformer": False,
|
||||
"adm_in_channels": None,
|
||||
}
|
||||
|
||||
unet_extra_config = {
|
||||
"num_heads": 8,
|
||||
"num_head_channels": -1,
|
||||
}
|
||||
|
||||
latent_format = latent_formats.SD15
|
||||
|
||||
def process_clip_state_dict(self, state_dict):
|
||||
k = list(state_dict.keys())
|
||||
for x in k:
|
||||
if x.startswith("cond_stage_model.transformer.") and not x.startswith("cond_stage_model.transformer.text_model."):
|
||||
y = x.replace("cond_stage_model.transformer.", "cond_stage_model.transformer.text_model.")
|
||||
state_dict[y] = state_dict.pop(x)
|
||||
|
||||
if 'cond_stage_model.transformer.text_model.embeddings.position_ids' in state_dict:
|
||||
ids = state_dict['cond_stage_model.transformer.text_model.embeddings.position_ids']
|
||||
if ids.dtype == torch.float32:
|
||||
state_dict['cond_stage_model.transformer.text_model.embeddings.position_ids'] = ids.round()
|
||||
|
||||
return state_dict
|
||||
|
||||
def clip_target(self):
|
||||
return supported_models_base.ClipTarget(sd1_clip.SD1Tokenizer, sd1_clip.SD1ClipModel)
|
||||
|
||||
class SD20(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"context_dim": 1024,
|
||||
"model_channels": 320,
|
||||
"use_linear_in_transformer": True,
|
||||
"adm_in_channels": None,
|
||||
}
|
||||
|
||||
latent_format = latent_formats.SD15
|
||||
|
||||
def v_prediction(self, state_dict):
|
||||
if self.unet_config["in_channels"] == 4: #SD2.0 inpainting models are not v prediction
|
||||
k = "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm1.bias"
|
||||
out = state_dict[k]
|
||||
if torch.std(out, unbiased=False) > 0.09: # not sure how well this will actually work. I guess we will find out.
|
||||
return True
|
||||
return False
|
||||
|
||||
def process_clip_state_dict(self, state_dict):
|
||||
state_dict = utils.transformers_convert(state_dict, "cond_stage_model.model.", "cond_stage_model.transformer.text_model.", 24)
|
||||
return state_dict
|
||||
|
||||
def clip_target(self):
|
||||
return supported_models_base.ClipTarget(sd2_clip.SD2Tokenizer, sd2_clip.SD2ClipModel)
|
||||
|
||||
class SD21UnclipL(SD20):
|
||||
unet_config = {
|
||||
"context_dim": 1024,
|
||||
"model_channels": 320,
|
||||
"use_linear_in_transformer": True,
|
||||
"adm_in_channels": 1536,
|
||||
}
|
||||
|
||||
clip_vision_prefix = "embedder.model.visual."
|
||||
noise_aug_config = {"noise_schedule_config": {"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"}, "timestep_dim": 768}
|
||||
|
||||
|
||||
class SD21UnclipH(SD20):
|
||||
unet_config = {
|
||||
"context_dim": 1024,
|
||||
"model_channels": 320,
|
||||
"use_linear_in_transformer": True,
|
||||
"adm_in_channels": 2048,
|
||||
}
|
||||
|
||||
clip_vision_prefix = "embedder.model.visual."
|
||||
noise_aug_config = {"noise_schedule_config": {"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"}, "timestep_dim": 1024}
|
||||
|
||||
class SDXLRefiner(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"model_channels": 384,
|
||||
"use_linear_in_transformer": True,
|
||||
"context_dim": 1280,
|
||||
"adm_in_channels": 2560,
|
||||
"transformer_depth": [0, 4, 4, 0],
|
||||
}
|
||||
|
||||
latent_format = latent_formats.SDXL
|
||||
|
||||
def get_model(self, state_dict):
|
||||
return model_base.SDXLRefiner(self)
|
||||
|
||||
def process_clip_state_dict(self, state_dict):
|
||||
keys_to_replace = {}
|
||||
replace_prefix = {}
|
||||
|
||||
state_dict = utils.transformers_convert(state_dict, "conditioner.embedders.0.model.", "cond_stage_model.clip_g.transformer.text_model.", 32)
|
||||
keys_to_replace["conditioner.embedders.0.model.text_projection"] = "cond_stage_model.clip_g.text_projection"
|
||||
|
||||
state_dict = supported_models_base.state_dict_key_replace(state_dict, keys_to_replace)
|
||||
return state_dict
|
||||
|
||||
def clip_target(self):
|
||||
return supported_models_base.ClipTarget(sdxl_clip.SDXLTokenizer, sdxl_clip.SDXLRefinerClipModel)
|
||||
|
||||
class SDXL(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"model_channels": 320,
|
||||
"use_linear_in_transformer": True,
|
||||
"transformer_depth": [0, 2, 10],
|
||||
"context_dim": 2048,
|
||||
"adm_in_channels": 2816
|
||||
}
|
||||
|
||||
latent_format = latent_formats.SDXL
|
||||
|
||||
def get_model(self, state_dict):
|
||||
return model_base.SDXL(self)
|
||||
|
||||
def process_clip_state_dict(self, state_dict):
|
||||
keys_to_replace = {}
|
||||
replace_prefix = {}
|
||||
|
||||
replace_prefix["conditioner.embedders.0.transformer.text_model"] = "cond_stage_model.clip_l.transformer.text_model"
|
||||
state_dict = utils.transformers_convert(state_dict, "conditioner.embedders.1.model.", "cond_stage_model.clip_g.transformer.text_model.", 32)
|
||||
keys_to_replace["conditioner.embedders.1.model.text_projection"] = "cond_stage_model.clip_g.text_projection"
|
||||
|
||||
state_dict = supported_models_base.state_dict_prefix_replace(state_dict, replace_prefix)
|
||||
state_dict = supported_models_base.state_dict_key_replace(state_dict, keys_to_replace)
|
||||
return state_dict
|
||||
|
||||
def clip_target(self):
|
||||
return supported_models_base.ClipTarget(sdxl_clip.SDXLTokenizer, sdxl_clip.SDXLClipModel)
|
||||
|
||||
|
||||
models = [SD15, SD20, SD21UnclipL, SD21UnclipH, SDXLRefiner, SDXL]
|
||||
66
comfy/supported_models_base.py
Normal file
@ -0,0 +1,66 @@
|
||||
import torch
|
||||
from . import model_base
|
||||
from . import utils
|
||||
|
||||
|
||||
def state_dict_key_replace(state_dict, keys_to_replace):
|
||||
for x in keys_to_replace:
|
||||
if x in state_dict:
|
||||
state_dict[keys_to_replace[x]] = state_dict.pop(x)
|
||||
return state_dict
|
||||
|
||||
def state_dict_prefix_replace(state_dict, replace_prefix):
|
||||
for rp in replace_prefix:
|
||||
replace = list(map(lambda a: (a, "{}{}".format(replace_prefix[rp], a[len(rp):])), filter(lambda a: a.startswith(rp), state_dict.keys())))
|
||||
for x in replace:
|
||||
state_dict[x[1]] = state_dict.pop(x[0])
|
||||
return state_dict
|
||||
|
||||
|
||||
class ClipTarget:
|
||||
def __init__(self, tokenizer, clip):
|
||||
self.clip = clip
|
||||
self.tokenizer = tokenizer
|
||||
self.params = {}
|
||||
|
||||
class BASE:
|
||||
unet_config = {}
|
||||
unet_extra_config = {
|
||||
"num_heads": -1,
|
||||
"num_head_channels": 64,
|
||||
}
|
||||
|
||||
clip_prefix = []
|
||||
clip_vision_prefix = None
|
||||
noise_aug_config = None
|
||||
|
||||
@classmethod
|
||||
def matches(s, unet_config):
|
||||
for k in s.unet_config:
|
||||
if s.unet_config[k] != unet_config[k]:
|
||||
return False
|
||||
return True
|
||||
|
||||
def v_prediction(self, state_dict):
|
||||
return False
|
||||
|
||||
def inpaint_model(self):
|
||||
return self.unet_config["in_channels"] > 4
|
||||
|
||||
def __init__(self, unet_config):
|
||||
self.unet_config = unet_config
|
||||
self.latent_format = self.latent_format()
|
||||
for x in self.unet_extra_config:
|
||||
self.unet_config[x] = self.unet_extra_config[x]
|
||||
|
||||
def get_model(self, state_dict):
|
||||
if self.inpaint_model():
|
||||
return model_base.SDInpaint(self, v_prediction=self.v_prediction(state_dict))
|
||||
elif self.noise_aug_config is not None:
|
||||
return model_base.SD21UNCLIP(self, self.noise_aug_config, v_prediction=self.v_prediction(state_dict))
|
||||
else:
|
||||
return model_base.BaseModel(self, v_prediction=self.v_prediction(state_dict))
|
||||
|
||||
def process_clip_state_dict(self, state_dict):
|
||||
return state_dict
|
||||
|
||||
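For context, a sketch of how the `models` list and `BASE.matches()` above can be scanned against a detected unet config. The real lookup lives in `comfy/model_detection.py`, which is not part of this hunk, so treat this as an illustration only; it assumes `unet_config` contains every key a candidate checks:

```python
from comfy import supported_models

def find_model_config(unet_config):
    for candidate in supported_models.models:   # [SD15, SD20, ..., SDXLRefiner, SDXL]
        if candidate.matches(unet_config):
            return candidate(unet_config)       # __init__ also merges in unet_extra_config
    return None
```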
@ -26,10 +26,10 @@ def load_torch_file(ckpt, safe_load=False):
|
||||
|
||||
def transformers_convert(sd, prefix_from, prefix_to, number):
|
||||
keys_to_replace = {
|
||||
"{}.positional_embedding": "{}.embeddings.position_embedding.weight",
|
||||
"{}.token_embedding.weight": "{}.embeddings.token_embedding.weight",
|
||||
"{}.ln_final.weight": "{}.final_layer_norm.weight",
|
||||
"{}.ln_final.bias": "{}.final_layer_norm.bias",
|
||||
"{}positional_embedding": "{}embeddings.position_embedding.weight",
|
||||
"{}token_embedding.weight": "{}embeddings.token_embedding.weight",
|
||||
"{}ln_final.weight": "{}final_layer_norm.weight",
|
||||
"{}ln_final.bias": "{}final_layer_norm.bias",
|
||||
}
|
||||
|
||||
for k in keys_to_replace:
|
||||
@ -48,19 +48,19 @@ def transformers_convert(sd, prefix_from, prefix_to, number):
|
||||
for resblock in range(number):
|
||||
for x in resblock_to_replace:
|
||||
for y in ["weight", "bias"]:
|
||||
k = "{}.transformer.resblocks.{}.{}.{}".format(prefix_from, resblock, x, y)
|
||||
k_to = "{}.encoder.layers.{}.{}.{}".format(prefix_to, resblock, resblock_to_replace[x], y)
|
||||
k = "{}transformer.resblocks.{}.{}.{}".format(prefix_from, resblock, x, y)
|
||||
k_to = "{}encoder.layers.{}.{}.{}".format(prefix_to, resblock, resblock_to_replace[x], y)
|
||||
if k in sd:
|
||||
sd[k_to] = sd.pop(k)
|
||||
|
||||
for y in ["weight", "bias"]:
|
||||
k_from = "{}.transformer.resblocks.{}.attn.in_proj_{}".format(prefix_from, resblock, y)
|
||||
k_from = "{}transformer.resblocks.{}.attn.in_proj_{}".format(prefix_from, resblock, y)
|
||||
if k_from in sd:
|
||||
weights = sd.pop(k_from)
|
||||
shape_from = weights.shape[0] // 3
|
||||
for x in range(3):
|
||||
p = ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"]
|
||||
k_to = "{}.encoder.layers.{}.{}.{}".format(prefix_to, resblock, p[x], y)
|
||||
k_to = "{}encoder.layers.{}.{}.{}".format(prefix_to, resblock, p[x], y)
|
||||
sd[k_to] = weights[shape_from*x:shape_from*(x + 1)]
|
||||
return sd
|
||||
|
||||
|
||||
@ -142,3 +142,36 @@ def get_functions(x, ratio, original_shape):
|
||||
|
||||
nothing = lambda y: y
|
||||
return nothing, nothing
|
||||
|
||||
|
||||
|
||||
class TomePatchModel:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "model": ("MODEL",),
|
||||
"ratio": ("FLOAT", {"default": 0.3, "min": 0.0, "max": 1.0, "step": 0.01}),
|
||||
}}
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "_for_testing"
|
||||
|
||||
def patch(self, model, ratio):
|
||||
self.u = None
|
||||
def tomesd_m(q, k, v, extra_options):
|
||||
#NOTE: In the reference code get_functions takes x (input of the transformer block) as the argument instead of q
|
||||
#however from my basic testing it seems that using q instead gives better results
|
||||
m, self.u = get_functions(q, ratio, extra_options["original_shape"])
|
||||
return m(q), k, v
|
||||
def tomesd_u(n, extra_options):
|
||||
return self.u(n)
|
||||
|
||||
m = model.clone()
|
||||
m.set_model_attn1_patch(tomesd_m)
|
||||
m.set_model_attn1_output_patch(tomesd_u)
|
||||
return (m, )
|
||||
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"TomePatchModel": TomePatchModel,
|
||||
}
|
||||
@ -49,14 +49,8 @@ class TAESDPreviewerImpl(LatentPreviewer):
|
||||
|
||||
|
||||
class Latent2RGBPreviewer(LatentPreviewer):
|
||||
def __init__(self):
|
||||
self.latent_rgb_factors = torch.tensor([
|
||||
# R G B
|
||||
[0.298, 0.207, 0.208], # L1
|
||||
[0.187, 0.286, 0.173], # L2
|
||||
[-0.158, 0.189, 0.264], # L3
|
||||
[-0.184, -0.271, -0.473], # L4
|
||||
], device="cpu")
|
||||
def __init__(self, latent_rgb_factors):
|
||||
self.latent_rgb_factors = torch.tensor(latent_rgb_factors, device="cpu")
|
||||
|
||||
def decode_latent_to_preview(self, x0):
|
||||
latent_image = x0[0].permute(1, 2, 0).cpu() @ self.latent_rgb_factors
|
||||
@ -69,12 +63,12 @@ class Latent2RGBPreviewer(LatentPreviewer):
|
||||
return Image.fromarray(latents_ubyte.numpy())
|
||||
|
||||
|
||||
def get_previewer(device):
|
||||
def get_previewer(device, latent_format):
|
||||
previewer = None
|
||||
method = args.preview_method
|
||||
if method != LatentPreviewMethod.NoPreviews:
|
||||
# TODO previewer methods
|
||||
taesd_decoder_path = folder_paths.get_full_path("vae_approx", "taesd_decoder.pth")
|
||||
taesd_decoder_path = folder_paths.get_full_path("vae_approx", latent_format.taesd_decoder_name)
|
||||
|
||||
if method == LatentPreviewMethod.Auto:
|
||||
method = LatentPreviewMethod.Latent2RGB
|
||||
@ -86,10 +80,10 @@ def get_previewer(device):
|
||||
taesd = TAESD(None, taesd_decoder_path).to(device)
|
||||
previewer = TAESDPreviewerImpl(taesd)
|
||||
else:
|
||||
print("Warning: TAESD previews enabled, but could not find models/vae_approx/taesd_decoder.pth")
|
||||
print("Warning: TAESD previews enabled, but could not find models/vae_approx/{}".format(latent_format.taesd_decoder_name))
|
||||
|
||||
if previewer is None:
|
||||
previewer = Latent2RGBPreviewer()
|
||||
previewer = Latent2RGBPreviewer(latent_format.latent_rgb_factors)
|
||||
return previewer
|
||||
|
||||
|
||||
|
||||
55
nodes.py
@ -48,7 +48,9 @@ class CLIPTextEncode:
|
||||
CATEGORY = "conditioning"
|
||||
|
||||
def encode(self, clip, text):
|
||||
return ([[clip.encode(text), {}]], )
|
||||
tokens = clip.tokenize(text)
|
||||
cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)
|
||||
return ([[cond, {"pooled_output": pooled}]], )
|
||||
|
||||
class ConditioningCombine:
|
||||
@classmethod
|
||||
@ -282,6 +284,7 @@ class SaveLatent:
|
||||
|
||||
output = {}
|
||||
output["latent_tensor"] = samples["samples"]
|
||||
output["latent_format_version_0"] = torch.tensor([])
|
||||
|
||||
safetensors.torch.save_file(output, file, metadata=metadata)
|
||||
|
||||
@ -303,7 +306,10 @@ class LoadLatent:
|
||||
def load(self, latent):
|
||||
latent_path = folder_paths.get_annotated_filepath(latent)
|
||||
latent = safetensors.torch.load_file(latent_path, device="cpu")
|
||||
samples = {"samples": latent["latent_tensor"].float()}
|
||||
multiplier = 1.0
|
||||
if "latent_format_version_0" not in latent:
|
||||
multiplier = 1.0 / 0.18215
|
||||
samples = {"samples": latent["latent_tensor"].float() * multiplier}
|
||||
return (samples, )
|
||||
|
||||
@classmethod
|
||||
@ -431,22 +437,6 @@ class LoraLoader:
|
||||
model_lora, clip_lora = comfy.sd.load_lora_for_models(model, clip, lora_path, strength_model, strength_clip)
|
||||
return (model_lora, clip_lora)
|
||||
|
||||
class TomePatchModel:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "model": ("MODEL",),
|
||||
"ratio": ("FLOAT", {"default": 0.3, "min": 0.0, "max": 1.0, "step": 0.01}),
|
||||
}}
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "_for_testing"
|
||||
|
||||
def patch(self, model, ratio):
|
||||
m = model.clone()
|
||||
m.set_model_tomesd(ratio)
|
||||
return (m, )
|
||||
|
||||
class VAELoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -530,11 +520,27 @@ class CLIPLoader:
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "load_clip"
|
||||
|
||||
CATEGORY = "loaders"
|
||||
CATEGORY = "advanced/loaders"
|
||||
|
||||
def load_clip(self, clip_name):
|
||||
clip_path = folder_paths.get_full_path("clip", clip_name)
|
||||
clip = comfy.sd.load_clip(ckpt_path=clip_path, embedding_directory=folder_paths.get_folder_paths("embeddings"))
|
||||
clip = comfy.sd.load_clip(ckpt_paths=[clip_path], embedding_directory=folder_paths.get_folder_paths("embeddings"))
|
||||
return (clip,)
|
||||
|
||||
class DualCLIPLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "clip_name1": (folder_paths.get_filename_list("clip"), ), "clip_name2": (folder_paths.get_filename_list("clip"), ),
|
||||
}}
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "load_clip"
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
|
||||
def load_clip(self, clip_name1, clip_name2):
|
||||
clip_path1 = folder_paths.get_full_path("clip", clip_name1)
|
||||
clip_path2 = folder_paths.get_full_path("clip", clip_name2)
|
||||
clip = comfy.sd.load_clip(ckpt_paths=[clip_path1, clip_path2], embedding_directory=folder_paths.get_folder_paths("embeddings"))
|
||||
return (clip,)
|
||||
|
||||
class CLIPVisionLoader:
|
||||
@ -948,7 +954,7 @@ def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive,
|
||||
if preview_format not in ["JPEG", "PNG"]:
|
||||
preview_format = "JPEG"
|
||||
|
||||
previewer = latent_preview.get_previewer(device)
|
||||
previewer = latent_preview.get_previewer(device, model.model.latent_format)
|
||||
|
||||
pbar = comfy.utils.ProgressBar(steps)
|
||||
def callback(step, x0, x, total_steps):
|
||||
@ -959,7 +965,7 @@ def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive,
|
||||
|
||||
samples = comfy.sample.sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image,
|
||||
denoise=denoise, disable_noise=disable_noise, start_step=start_step, last_step=last_step,
|
||||
force_full_denoise=force_full_denoise, noise_mask=noise_mask, callback=callback)
|
||||
force_full_denoise=force_full_denoise, noise_mask=noise_mask, callback=callback, seed=seed)
|
||||
out = latent.copy()
|
||||
out["samples"] = samples
|
||||
return (out, )
|
||||
@ -1325,6 +1331,7 @@ NODE_CLASS_MAPPINGS = {
|
||||
"LatentCrop": LatentCrop,
|
||||
"LoraLoader": LoraLoader,
|
||||
"CLIPLoader": CLIPLoader,
|
||||
"DualCLIPLoader": DualCLIPLoader,
|
||||
"CLIPVisionEncode": CLIPVisionEncode,
|
||||
"StyleModelApply": StyleModelApply,
|
||||
"unCLIPConditioning": unCLIPConditioning,
|
||||
@ -1335,7 +1342,6 @@ NODE_CLASS_MAPPINGS = {
|
||||
"CLIPVisionLoader": CLIPVisionLoader,
|
||||
"VAEDecodeTiled": VAEDecodeTiled,
|
||||
"VAEEncodeTiled": VAEEncodeTiled,
|
||||
"TomePatchModel": TomePatchModel,
|
||||
"unCLIPCheckpointLoader": unCLIPCheckpointLoader,
|
||||
"GLIGENLoader": GLIGENLoader,
|
||||
"GLIGENTextBoxApply": GLIGENTextBoxApply,
|
||||
@ -1344,7 +1350,7 @@ NODE_CLASS_MAPPINGS = {
|
||||
"DiffusersLoader": DiffusersLoader,
|
||||
|
||||
"LoadLatent": LoadLatent,
|
||||
"SaveLatent": SaveLatent
|
||||
"SaveLatent": SaveLatent,
|
||||
}
|
||||
|
||||
NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
@ -1460,4 +1466,5 @@ def init_custom_nodes():
|
||||
load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_mask.py"))
|
||||
load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_rebatch.py"))
|
||||
load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_model_merging.py"))
|
||||
load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_tomesd.py"))
|
||||
load_custom_nodes()
|
||||
|
||||
44
server.py
@ -64,7 +64,7 @@ class PromptServer():
|
||||
def __init__(self, loop):
|
||||
PromptServer.instance = self
|
||||
|
||||
mimetypes.init();
|
||||
mimetypes.init()
|
||||
mimetypes.types_map['.js'] = 'application/javascript; charset=utf-8'
|
||||
self.prompt_queue = None
|
||||
self.loop = loop
|
||||
@ -186,18 +186,43 @@ class PromptServer():
|
||||
post = await request.post()
|
||||
return image_upload(post)
|
||||
|
||||
|
||||
@routes.post("/upload/mask")
|
||||
async def upload_mask(request):
|
||||
post = await request.post()
|
||||
|
||||
def image_save_function(image, post, filepath):
|
||||
original_pil = Image.open(post.get("original_image").file).convert('RGBA')
|
||||
mask_pil = Image.open(image.file).convert('RGBA')
|
||||
original_ref = json.loads(post.get("original_ref"))
|
||||
filename, output_dir = folder_paths.annotated_filepath(original_ref['filename'])
|
||||
|
||||
# alpha copy
|
||||
new_alpha = mask_pil.getchannel('A')
|
||||
original_pil.putalpha(new_alpha)
|
||||
original_pil.save(filepath, compress_level=4)
|
||||
# validation for security: prevent accessing arbitrary path
|
||||
if filename[0] == '/' or '..' in filename:
|
||||
return web.Response(status=400)
|
||||
|
||||
if output_dir is None:
|
||||
type = original_ref.get("type", "output")
|
||||
output_dir = folder_paths.get_directory_by_type(type)
|
||||
|
||||
if output_dir is None:
|
||||
return web.Response(status=400)
|
||||
|
||||
if original_ref.get("subfolder", "") != "":
|
||||
full_output_dir = os.path.join(output_dir, original_ref["subfolder"])
|
||||
if os.path.commonpath((os.path.abspath(full_output_dir), output_dir)) != output_dir:
|
||||
return web.Response(status=403)
|
||||
output_dir = full_output_dir
|
||||
|
||||
file = os.path.join(output_dir, filename)
|
||||
|
||||
if os.path.isfile(file):
|
||||
with Image.open(file) as original_pil:
|
||||
original_pil = original_pil.convert('RGBA')
|
||||
mask_pil = Image.open(image.file).convert('RGBA')
|
||||
|
||||
# alpha copy
|
||||
new_alpha = mask_pil.getchannel('A')
|
||||
original_pil.putalpha(new_alpha)
|
||||
original_pil.save(filepath, compress_level=4)
|
||||
|
||||
return image_upload(post, image_save_function)
|
||||
|
||||
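The interesting part of the new `/upload/mask` handler is the path validation. A standalone sketch of the same checks; the helper name and return convention are illustrative, not the server's actual API:

```python
import os

def resolve_upload_target(output_dir, filename, subfolder=""):
    # reject absolute paths and parent-directory traversal, as the handler does
    if filename.startswith('/') or '..' in filename:
        return None
    output_dir = os.path.abspath(output_dir)
    if subfolder:
        full_output_dir = os.path.abspath(os.path.join(output_dir, subfolder))
        # the subfolder must resolve to a location inside output_dir
        if os.path.commonpath((full_output_dir, output_dir)) != output_dir:
            return None
        output_dir = full_output_dir
    return os.path.join(output_dir, filename)
```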
@ -231,9 +256,8 @@ class PromptServer():
|
||||
if 'preview' in request.rel_url.query:
|
||||
with Image.open(file) as img:
|
||||
preview_info = request.rel_url.query['preview'].split(';')
|
||||
|
||||
image_format = preview_info[0]
|
||||
if image_format not in ['webp', 'jpeg']:
|
||||
if image_format not in ['webp', 'jpeg'] or 'a' in request.rel_url.query.get('channel', ''):
|
||||
image_format = 'webp'
|
||||
|
||||
quality = 90
|
||||
@ -241,7 +265,7 @@ class PromptServer():
|
||||
quality = int(preview_info[-1])
|
||||
|
||||
buffer = BytesIO()
|
||||
if image_format in ['jpeg']:
|
||||
if image_format in ['jpeg'] or request.rel_url.query.get('channel', '') == 'rgb':
|
||||
img = img.convert("RGB")
|
||||
img.save(buffer, format=image_format, quality=quality)
|
||||
buffer.seek(0)
|
||||
|
||||
@ -346,7 +346,6 @@ class MaskEditorDialog extends ComfyDialog {
|
||||
|
||||
const rgb_url = new URL(ComfyApp.clipspace.imgs[ComfyApp.clipspace['selectedIndex']].src);
|
||||
rgb_url.searchParams.delete('channel');
|
||||
rgb_url.searchParams.delete('preview');
|
||||
rgb_url.searchParams.set('channel', 'rgb');
|
||||
orig_image.src = rgb_url;
|
||||
this.image = orig_image;
|
||||
@ -618,10 +617,20 @@ class MaskEditorDialog extends ComfyDialog {
|
||||
const dataURL = this.backupCanvas.toDataURL();
|
||||
const blob = dataURLToBlob(dataURL);
|
||||
|
||||
const original_blob = loadedImageToBlob(this.image);
|
||||
let original_url = new URL(this.image.src);
|
||||
|
||||
const original_ref = { filename: original_url.searchParams.get('filename') };
|
||||
|
||||
let original_subfolder = original_url.searchParams.get("subfolder");
|
||||
if(original_subfolder)
|
||||
original_ref.subfolder = original_subfolder;
|
||||
|
||||
let original_type = original_url.searchParams.get("type");
|
||||
if(original_type)
|
||||
original_ref.type = original_type;
|
||||
|
||||
formData.append('image', blob, filename);
|
||||
formData.append('original_image', original_blob);
|
||||
formData.append('original_ref', JSON.stringify(original_ref));
|
||||
formData.append('type', "input");
|
||||
formData.append('subfolder', "clipspace");
|
||||
|
||||
|
||||
@ -159,14 +159,19 @@ export class ComfyApp {
|
||||
const clip_image = ComfyApp.clipspace.images[ComfyApp.clipspace['selectedIndex']];
|
||||
const index = node.widgets.findIndex(obj => obj.name === 'image');
|
||||
if(index >= 0) {
|
||||
node.widgets[index].value = clip_image;
|
||||
if(node.widgets[index].type != 'image' && typeof node.widgets[index].value == "string" && clip_image.filename) {
|
||||
node.widgets[index].value = (clip_image.subfolder?clip_image.subfolder+'/':'') + clip_image.filename + (clip_image.type?` [${clip_image.type}]`:'');
|
||||
}
|
||||
else {
|
||||
node.widgets[index].value = clip_image;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(ComfyApp.clipspace.widgets) {
|
||||
ComfyApp.clipspace.widgets.forEach(({ type, name, value }) => {
|
||||
const prop = Object.values(node.widgets).find(obj => obj.type === type && obj.name === name);
|
||||
if (prop && prop.type != 'image') {
|
||||
if(typeof prop.value == "string" && value.filename) {
|
||||
if (prop && prop.type != 'button') {
|
||||
if(prop.type != 'image' && typeof prop.value == "string" && value.filename) {
|
||||
prop.value = (value.subfolder?value.subfolder+'/':'') + value.filename + (value.type?` [${value.type}]`:'');
|
||||
}
|
||||
else {
|
||||
@ -174,10 +179,6 @@ export class ComfyApp {
|
||||
prop.callback(value);
|
||||
}
|
||||
}
|
||||
else if (prop && prop.type != 'button') {
|
||||
prop.value = value;
|
||||
prop.callback(value);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||