Merge branch 'Main' into feature/blockweights

2026-07-15 02:49:18 +08:00 · 2023-04-03 09:12:13 +09:00 · 2023-04-03 09:12:13 +09:00 · d8147b6635
commit d8147b6635
parent c56308f152 72f9235a49
29 changed files with 1174 additions and 208 deletions
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@ -0,0 +1,62 @@
+from transformers import CLIPVisionModelWithProjection, CLIPVisionConfig, CLIPImageProcessor
+from .utils import load_torch_file, transformers_convert
+import os
+
+class ClipVisionModel():
+    """CLIP vision model with projection head, bundled with its image preprocessor.
+
+    The network is instantiated from a JSON config file; weights are supplied
+    afterwards through ``load_sd``.
+    """
+
+    def __init__(self, json_config):
+        """Build the model described by ``json_config`` and a 224x224 preprocessor."""
+        self.model = CLIPVisionModelWithProjection(CLIPVisionConfig.from_json_file(json_config))
+
+        # Preprocessing: resize + center crop to 224, convert to RGB, normalize.
+        processor_settings = dict(
+            size=224,
+            crop_size=224,
+            resample=3,  # bicubic
+            do_resize=True,
+            do_center_crop=True,
+            do_convert_rgb=True,
+            do_normalize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+        )
+        self.processor = CLIPImageProcessor(**processor_settings)
+
+    def load_sd(self, sd):
+        """Load state dict ``sd`` into the model non-strictly (key mismatches do not raise)."""
+        self.model.load_state_dict(sd, strict=False)
+
+    def encode_image(self, image):
+        """Preprocess ``image[0]`` and return the model output for it.
+
+        Only the first element of ``image`` is encoded.
+        """
+        first_image = image[0]
+        batch = self.processor(images=[first_image], return_tensors="pt")
+        return self.model(**batch)
+
+def convert_to_transformers(sd):
+    """Rename ``embedder.model.visual.*`` keys of ``sd`` to ``vision_model.*`` names.
+
+    The state dict is only touched when it contains the
+    ``embedder.model.visual.transformer.resblocks.0.attn.in_proj_weight`` key;
+    otherwise it is returned unchanged. Renames are applied to ``sd`` in place,
+    and the result of ``transformers_convert`` is returned.
+    """
+    marker = "embedder.model.visual.transformer.resblocks.0.attn.in_proj_weight"
+    if marker not in sd:
+        return sd
+
+    # (source key, target key) pairs; target spellings are kept exactly as given.
+    renames = (
+        ("embedder.model.visual.class_embedding", "vision_model.embeddings.class_embedding"),
+        ("embedder.model.visual.conv1.weight", "vision_model.embeddings.patch_embedding.weight"),
+        ("embedder.model.visual.positional_embedding", "vision_model.embeddings.position_embedding.weight"),
+        ("embedder.model.visual.ln_post.bias", "vision_model.post_layernorm.bias"),
+        ("embedder.model.visual.ln_post.weight", "vision_model.post_layernorm.weight"),
+        ("embedder.model.visual.ln_pre.bias", "vision_model.pre_layrnorm.bias"),
+        ("embedder.model.visual.ln_pre.weight", "vision_model.pre_layrnorm.weight"),
+    )
+    for old_key, new_key in renames:
+        if old_key in sd:
+            sd[new_key] = sd.pop(old_key)
+
+    # The projection matrix is stored transposed relative to the target key.
+    proj_key = "embedder.model.visual.proj"
+    if proj_key in sd:
+        sd['visual_projection.weight'] = sd.pop(proj_key).transpose(0, 1)
+
+    return transformers_convert(sd, "embedder.model.visual", "vision_model", 32)
+
+def load_clipvision_from_sd(sd):
+    """Create a ``ClipVisionModel`` from state dict ``sd``.
+
+    The state dict is first passed through ``convert_to_transformers``. The
+    config file (located next to this module) is chosen by whether the
+    converted dict has an encoder layer with index 30.
+    """
+    sd = convert_to_transformers(sd)
+    has_layer_30 = "vision_model.encoder.layers.30.layer_norm1.weight" in sd
+    config_name = "clip_vision_config_h.json" if has_layer_30 else "clip_vision_config_vitl.json"
+    json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_name)
+    clip = ClipVisionModel(json_config)
+    clip.load_sd(sd)
+    return clip
+
+def load(ckpt_path):
+    """Read the checkpoint at ``ckpt_path`` and build a ``ClipVisionModel`` from it."""
+    return load_clipvision_from_sd(load_torch_file(ckpt_path))
--- a/comfy/clip_vision_config_h.json
+++ b/comfy/clip_vision_config_h.json
@ -0,0 +1,18 @@
+{
+  "attention_dropout": 0.0,
+  "dropout": 0.0,
+  "hidden_act": "gelu",
+  "hidden_size": 1280,
+  "image_size": 224,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 5120,
+  "layer_norm_eps": 1e-05,
+  "model_type": "clip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 32,
+  "patch_size": 14,
+  "projection_dim": 1024,
+  "torch_dtype": "float32"
+}
--- a/comfy_extras/clip_vision_config.json
+++ b/comfy_extras/clip_vision_config.json
@ -1,8 +1,4 @@
 {
-  "_name_or_path": "openai/clip-vit-large-patch14",
-  "architectures": [
-    "CLIPVisionModel"
-  ],
  "attention_dropout": 0.0,
  "dropout": 0.0,
  "hidden_act": "quick_gelu",
@ -18,6 +14,5 @@
  "num_hidden_layers": 24,
  "patch_size": 14,
  "projection_dim": 768,
-  "torch_dtype": "float32",
-  "transformers_version": "4.24.0"
+  "torch_dtype": "float32"
 }
--- a/comfy/ldm/models/diffusion/ddim.py
+++ b/comfy/ldm/models/diffusion/ddim.py
@ -78,7 +78,7 @@ class DDIMSampler(object):
                      dynamic_threshold=None,
                      ucg_schedule=None,
                      denoise_function=None,
-                      cond_concat=None,
+                      extra_args=None,
                      to_zero=True,
                      end_step=None,
                      **kwargs
@ -101,7 +101,7 @@ class DDIMSampler(object):
                                                    dynamic_threshold=dynamic_threshold,
                                                    ucg_schedule=ucg_schedule,
                                                    denoise_function=denoise_function,
-                                                    cond_concat=cond_concat,
+                                                    extra_args=extra_args,
                                                    to_zero=to_zero,
                                                    end_step=end_step
                                                    )
@ -174,7 +174,7 @@ class DDIMSampler(object):
                                                    dynamic_threshold=dynamic_threshold,
                                                    ucg_schedule=ucg_schedule,
                                                    denoise_function=None,
-                                                    cond_concat=None
+                                                    extra_args=None
                                                    )
        return samples, intermediates

@ -185,7 +185,7 @@ class DDIMSampler(object):
                      mask=None, x0=None, img_callback=None, log_every_t=100,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None, dynamic_threshold=None,
-                      ucg_schedule=None, denoise_function=None, cond_concat=None, to_zero=True, end_step=None):
+                      ucg_schedule=None, denoise_function=None, extra_args=None, to_zero=True, end_step=None):
        device = self.model.betas.device
        b = shape[0]
        if x_T is None:
@ -225,7 +225,7 @@ class DDIMSampler(object):
                                      corrector_kwargs=corrector_kwargs,
                                      unconditional_guidance_scale=unconditional_guidance_scale,
                                      unconditional_conditioning=unconditional_conditioning,
-                                      dynamic_threshold=dynamic_threshold, denoise_function=denoise_function, cond_concat=cond_concat)
+                                      dynamic_threshold=dynamic_threshold, denoise_function=denoise_function, extra_args=extra_args)
            img, pred_x0 = outs
            if callback: callback(i)
            if img_callback: img_callback(pred_x0, i)
@ -249,11 +249,11 @@ class DDIMSampler(object):
    def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None,
-                      dynamic_threshold=None, denoise_function=None, cond_concat=None):
+                      dynamic_threshold=None, denoise_function=None, extra_args=None):
        b, *_, device = *x.shape, x.device

        if denoise_function is not None:
-            model_output = denoise_function(self.model.apply_model, x, t, unconditional_conditioning, c, unconditional_guidance_scale, cond_concat)
+            model_output = denoise_function(self.model.apply_model, x, t, **extra_args)
        elif unconditional_conditioning is None or unconditional_guidance_scale == 1.:
            model_output = self.model.apply_model(x, t, c)
        else:
--- a/comfy/ldm/models/diffusion/ddpm.py
+++ b/comfy/ldm/models/diffusion/ddpm.py
@ -1317,12 +1317,12 @@ class DiffusionWrapper(torch.nn.Module):
        self.conditioning_key = conditioning_key
        assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm', 'hybrid-adm', 'crossattn-adm']

-    def forward(self, x, t, c_concat: list = None, c_crossattn: list = None, c_adm=None, control=None):
+    def forward(self, x, t, c_concat: list = None, c_crossattn: list = None, c_adm=None, control=None, transformer_options={}):
        if self.conditioning_key is None:
-            out = self.diffusion_model(x, t, control=control)
+            out = self.diffusion_model(x, t, control=control, transformer_options=transformer_options)
        elif self.conditioning_key == 'concat':
            xc = torch.cat([x] + c_concat, dim=1)
-            out = self.diffusion_model(xc, t, control=control)
+            out = self.diffusion_model(xc, t, control=control, transformer_options=transformer_options)
        elif self.conditioning_key == 'crossattn':
            if not self.sequential_cross_attn:
                cc = torch.cat(c_crossattn, 1)
@ -1332,25 +1332,25 @@ class DiffusionWrapper(torch.nn.Module):
                # TorchScript changes names of the arguments
                # with argument cc defined as context=cc scripted model will produce
                # an error: RuntimeError: forward() is missing value for argument 'argument_3'.
-                out = self.scripted_diffusion_model(x, t, cc, control=control)
+                out = self.scripted_diffusion_model(x, t, cc, control=control, transformer_options=transformer_options)
            else:
-                out = self.diffusion_model(x, t, context=cc, control=control)
+                out = self.diffusion_model(x, t, context=cc, control=control, transformer_options=transformer_options)
        elif self.conditioning_key == 'hybrid':
            xc = torch.cat([x] + c_concat, dim=1)
            cc = torch.cat(c_crossattn, 1)
-            out = self.diffusion_model(xc, t, context=cc, control=control)
+            out = self.diffusion_model(xc, t, context=cc, control=control, transformer_options=transformer_options)
        elif self.conditioning_key == 'hybrid-adm':
            assert c_adm is not None
            xc = torch.cat([x] + c_concat, dim=1)
            cc = torch.cat(c_crossattn, 1)
-            out = self.diffusion_model(xc, t, context=cc, y=c_adm, control=control)
+            out = self.diffusion_model(xc, t, context=cc, y=c_adm, control=control, transformer_options=transformer_options)
        elif self.conditioning_key == 'crossattn-adm':
            assert c_adm is not None
            cc = torch.cat(c_crossattn, 1)
-            out = self.diffusion_model(x, t, context=cc, y=c_adm, control=control)
+            out = self.diffusion_model(x, t, context=cc, y=c_adm, control=control, transformer_options=transformer_options)
        elif self.conditioning_key == 'adm':
            cc = c_crossattn[0]
-            out = self.diffusion_model(x, t, y=cc, control=control)
+            out = self.diffusion_model(x, t, y=cc, control=control, transformer_options=transformer_options)
        else:
            raise NotImplementedError()

@ -1801,3 +1801,75 @@ class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion):
        log = super().log_images(*args, **kwargs)
        log["lr"] = rearrange(args[0]["lr"], 'b h w c -> b c h w')
        return log
+
+
+class ImageEmbeddingConditionedLatentDiffusion(LatentDiffusion):
+    """Latent diffusion variant that adds an image embedding as ``c_adm`` conditioning.
+
+    ``get_input`` embeds ``batch[embedding_key]`` with ``self.embedder``,
+    optionally passes the result through a noise augmentor, and returns it
+    alongside the cross-attention conditioning.
+    """
+
+    def __init__(self, embedder_config=None, embedding_key="jpg", embedding_dropout=0.5,
+                 freeze_embedder=True, noise_aug_config=None, *args, **kwargs):
+        """Store the embedding settings and set up the optional noise augmentor.
+
+        Remaining positional/keyword arguments are forwarded to ``LatentDiffusion``.
+        """
+        super().__init__(*args, **kwargs)
+        self.embed_key = embedding_key
+        self.embedding_dropout = embedding_dropout
+        # NOTE(review): embedder construction is disabled, so ``embedder_config``
+        # and ``freeze_embedder`` are currently unused and ``self.embedder`` is
+        # not set here; ``get_input`` needs it to be provided some other way.
+        # self._init_embedder(embedder_config, freeze_embedder)
+        self._init_noise_aug(noise_aug_config)
+
+    def _init_embedder(self, config, freeze=True):
+        """Instantiate the embedder from ``config`` and attach it as ``self.embedder``.
+
+        With ``freeze`` set, the embedder is put in eval mode, its ``train``
+        is replaced by ``disabled_train`` and its parameters stop requiring
+        gradients. The embedder is attached in both cases (previously a
+        non-frozen embedder was instantiated and then dropped).
+        """
+        embedder = instantiate_from_config(config)
+        if freeze:
+            embedder = embedder.eval()
+            embedder.train = disabled_train
+            for param in embedder.parameters():
+                param.requires_grad = False
+        self.embedder = embedder
+
+    def _init_noise_aug(self, config):
+        """Set ``self.noise_augmentor`` from ``config``, or to ``None`` when no config is given."""
+        if config is not None:
+            # use the KARLO schedule for noise augmentation on CLIP image embeddings
+            noise_augmentor = instantiate_from_config(config)
+            assert isinstance(noise_augmentor, nn.Module)
+            noise_augmentor = noise_augmentor.eval()
+            noise_augmentor.train = disabled_train
+            self.noise_augmentor = noise_augmentor
+        else:
+            self.noise_augmentor = None
+
+    def get_input(self, batch, k, cond_key=None, bs=None, **kwargs):
+        """Return ``[z, {"c_crossattn": [c], "c_adm": c_adm}, *extras]`` for ``batch``.
+
+        ``z``, ``c`` and any extra outputs come from ``LatentDiffusion.get_input``;
+        ``cond_key`` is accepted but not forwarded. ``c_adm`` is the embedding of
+        ``batch[self.embed_key][:bs]`` (rearranged from ``b h w c`` to ``b c h w``).
+        """
+        outputs = LatentDiffusion.get_input(self, batch, k, bs=bs, **kwargs)
+        z, c = outputs[0], outputs[1]
+        img = batch[self.embed_key][:bs]
+        img = rearrange(img, 'b h w c -> b c h w')
+        c_adm = self.embedder(img)
+        if self.noise_augmentor is not None:
+            c_adm, noise_level_emb = self.noise_augmentor(c_adm)
+            # assume this gives embeddings of noise levels
+            c_adm = torch.cat((c_adm, noise_level_emb), 1)
+        if self.training:
+            # Per-sample dropout: each row of c_adm is kept with probability
+            # (1 - embedding_dropout) and zeroed otherwise.
+            c_adm = torch.bernoulli((1. - self.embedding_dropout) * torch.ones(c_adm.shape[0],
+                                                                               device=c_adm.device)[:, None]) * c_adm
+        all_conds = {"c_crossattn": [c], "c_adm": c_adm}
+        noutputs = [z, all_conds]
+        noutputs.extend(outputs[2:])
+        return noutputs
+
+    @torch.no_grad()
+    def log_images(self, batch, N=8, n_row=4, **kwargs):
+        """Return a dict with inputs, reconstructions, a rendered caption image and CFG samples.
+
+        Recognised ``kwargs``: ``unconditional_guidance_label``,
+        ``unconditional_guidance_scale`` (default 5.), ``use_ema_scope``,
+        ``ddim_steps`` (default 50) and ``ddim_eta`` (default 0.).
+        ``n_row`` is accepted but not used.
+        """
+        log = dict()
+        z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key, bs=N, return_first_stage_outputs=True,
+                                           return_original_cond=True)
+        log["inputs"] = x
+        log["reconstruction"] = xrec
+        assert self.model.conditioning_key is not None
+        assert self.cond_stage_key in ["caption", "txt"]
+        xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25)
+        log["conditioning"] = xc
+        uc = self.get_unconditional_conditioning(N, kwargs.get('unconditional_guidance_label', ''))
+        unconditional_guidance_scale = kwargs.get('unconditional_guidance_scale', 5.)
+
+        # The unconditional branch reuses the conditional image embedding.
+        uc_ = {"c_crossattn": [uc], "c_adm": c["c_adm"]}
+        ema_scope = self.ema_scope if kwargs.get('use_ema_scope', True) else nullcontext
+        with ema_scope("Sampling"):
+            samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=True,
+                                             ddim_steps=kwargs.get('ddim_steps', 50), eta=kwargs.get('ddim_eta', 0.),
+                                             unconditional_guidance_scale=unconditional_guidance_scale,
+                                             unconditional_conditioning=uc_, )
+            x_samples_cfg = self.decode_first_stage(samples_cfg)
+            log[f"samplescfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
+        return log
--- a/comfy/ldm/models/diffusion/dpm_solver/dpm_solver.py
+++ b/comfy/ldm/models/diffusion/dpm_solver/dpm_solver.py
@ -307,7 +307,16 @@ def model_wrapper(
            else:
                x_in = torch.cat([x] * 2)
                t_in = torch.cat([t_continuous] * 2)
-                c_in = torch.cat([unconditional_condition, condition])
+                if isinstance(condition, dict):
+                    assert isinstance(unconditional_condition, dict)
+                    c_in = dict()
+                    for k in condition:
+                        if isinstance(condition[k], list):
+                            c_in[k] = [torch.cat([unconditional_condition[k][i], condition[k][i]]) for i in range(len(condition[k]))]
+                        else:
+                            c_in[k] = torch.cat([unconditional_condition[k], condition[k]])
+                else:
+                    c_in = torch.cat([unconditional_condition, condition])
                noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
                return noise_uncond + guidance_scale * (noise - noise_uncond)

--- a/comfy/ldm/models/diffusion/dpm_solver/sampler.py
+++ b/comfy/ldm/models/diffusion/dpm_solver/sampler.py
@ -3,7 +3,6 @@ import torch

 from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver

-
 MODEL_TYPES = {
    "eps": "noise",
    "v": "v"
@ -51,12 +50,20 @@ class DPMSolverSampler(object):
               ):
        if conditioning is not None:
            if isinstance(conditioning, dict):
-                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
-                if cbs != batch_size:
-                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
+                ctmp = conditioning[list(conditioning.keys())[0]]
+                while isinstance(ctmp, list): ctmp = ctmp[0]
+                if isinstance(ctmp, torch.Tensor):
+                    cbs = ctmp.shape[0]
+                    if cbs != batch_size:
+                        print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
+            elif isinstance(conditioning, list):
+                for ctmp in conditioning:
+                    if ctmp.shape[0] != batch_size:
+                        print(f"Warning: Got {ctmp.shape[0]} conditionings but batch-size is {batch_size}")
            else:
-                if conditioning.shape[0] != batch_size:
-                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
+                if isinstance(conditioning, torch.Tensor):
+                    if conditioning.shape[0] != batch_size:
+                        print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        # sampling
        C, H, W = shape
@ -83,6 +90,7 @@ class DPMSolverSampler(object):
        )

        dpm_solver = DPM_Solver(model_fn, ns, predict_x0=True, thresholding=False)
-        x = dpm_solver.sample(img, steps=S, skip_type="time_uniform", method="multistep", order=2, lower_order_final=True)
+        x = dpm_solver.sample(img, steps=S, skip_type="time_uniform", method="multistep", order=2,
+                              lower_order_final=True)

-        return x.to(device), None
+        return x.to(device), None
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@ -11,6 +11,7 @@ from .sub_quadratic_attention import efficient_dot_product_attention

 import model_management

+from . import tomesd

 if model_management.xformers_enabled():
    import xformers
@ -504,12 +505,22 @@ class BasicTransformerBlock(nn.Module):
        self.norm3 = nn.LayerNorm(dim)
        self.checkpoint = checkpoint

-    def forward(self, x, context=None):
-        return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)
+    def forward(self, x, context=None, transformer_options={}):
+        return checkpoint(self._forward, (x, context, transformer_options), self.parameters(), self.checkpoint)

-    def _forward(self, x, context=None):
-        x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None) + x
-        x = self.attn2(self.norm2(x), context=context) + x
+    def _forward(self, x, context=None, transformer_options={}):
+        n = self.norm1(x)
+        if "tomesd" in transformer_options:
+            m, u = tomesd.get_functions(x, transformer_options["tomesd"]["ratio"], transformer_options["original_shape"])
+            n = u(self.attn1(m(n), context=context if self.disable_self_attn else None))
+        else:
+            n = self.attn1(n, context=context if self.disable_self_attn else None)
+
+        x += n
+        n = self.norm2(x)
+        n = self.attn2(n, context=context)
+
+        x += n
        x = self.ff(self.norm3(x)) + x
        return x

@ -557,7 +568,7 @@ class SpatialTransformer(nn.Module):
            self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
        self.use_linear = use_linear

-    def forward(self, x, context=None):
+    def forward(self, x, context=None, transformer_options={}):
        # note: if no context is given, cross-attention defaults to self-attention
        if not isinstance(context, list):
            context = [context]
@ -570,7 +581,7 @@ class SpatialTransformer(nn.Module):
        if self.use_linear:
            x = self.proj_in(x)
        for i, block in enumerate(self.transformer_blocks):
-            x = block(x, context=context[i])
+            x = block(x, context=context[i], transformer_options=transformer_options)
        if self.use_linear:
            x = self.proj_out(x)
        x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
--- a/comfy/ldm/modules/diffusionmodules/openaimodel.py
+++ b/comfy/ldm/modules/diffusionmodules/openaimodel.py
@ -76,12 +76,12 @@ class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    support it as an extra input.
    """

-    def forward(self, x, emb, context=None):
+    def forward(self, x, emb, context=None, transformer_options={}):
        for layer in self:
            if isinstance(layer, TimestepBlock):
                x = layer(x, emb)
            elif isinstance(layer, SpatialTransformer):
-                x = layer(x, context)
+                x = layer(x, context, transformer_options)
            else:
                x = layer(x)
        return x
@ -409,6 +409,15 @@ class QKVAttention(nn.Module):
        return count_flops_attn(model, _x, y)


+class Timestep(nn.Module):
+    """Module wrapper around ``timestep_embedding`` with a fixed embedding size."""
+
+    def __init__(self, dim):
+        """Remember ``dim``, the size passed to ``timestep_embedding``."""
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, t):
+        """Return ``timestep_embedding(t, self.dim)``."""
+        embedding = timestep_embedding(t, self.dim)
+        return embedding
+
+
 class UNetModel(nn.Module):
    """
    The full UNet model with attention and timestep embedding.
@ -470,6 +479,7 @@ class UNetModel(nn.Module):
        num_attention_blocks=None,
        disable_middle_self_attn=False,
        use_linear_in_transformer=False,
+        adm_in_channels=None,
    ):
        super().__init__()
        if use_spatial_transformer:
@ -538,6 +548,15 @@ class UNetModel(nn.Module):
            elif self.num_classes == "continuous":
                print("setting up linear c_adm embedding layer")
                self.label_emb = nn.Linear(1, time_embed_dim)
+            elif self.num_classes == "sequential":
+                assert adm_in_channels is not None
+                self.label_emb = nn.Sequential(
+                    nn.Sequential(
+                        linear(adm_in_channels, time_embed_dim),
+                        nn.SiLU(),
+                        linear(time_embed_dim, time_embed_dim),
+                    )
+                )
            else:
                raise ValueError()

@ -753,7 +772,7 @@ class UNetModel(nn.Module):
        self.middle_block.apply(convert_module_to_f32)
        self.output_blocks.apply(convert_module_to_f32)

-    def forward(self, x, timesteps=None, context=None, y=None, control=None, **kwargs):
+    def forward(self, x, timesteps=None, context=None, y=None, control=None, transformer_options={}, **kwargs):
        """
        Apply the model to an input batch.
        :param x: an [N x C x ...] Tensor of inputs.
@ -762,6 +781,7 @@ class UNetModel(nn.Module):
        :param y: an [N] Tensor of labels, if class-conditional.
        :return: an [N x C x ...] Tensor of outputs.
        """
+        transformer_options["original_shape"] = list(x.shape)
        assert (y is not None) == (
            self.num_classes is not None
        ), "must specify y if and only if the model is class-conditional"
@ -775,13 +795,13 @@ class UNetModel(nn.Module):

        h = x.type(self.dtype)
        for id, module in enumerate(self.input_blocks):
-            h = module(h, emb, context)
+            h = module(h, emb, context, transformer_options)
            if control is not None and 'input' in control and len(control['input']) > 0:
                ctrl = control['input'].pop()
                if ctrl is not None:
                    h += ctrl
            hs.append(h)
-        h = self.middle_block(h, emb, context)
+        h = self.middle_block(h, emb, context, transformer_options)
        if control is not None and 'middle' in control and len(control['middle']) > 0:
            h += control['middle'].pop()

@ -793,7 +813,7 @@ class UNetModel(nn.Module):
                    hsp += ctrl
            h = th.cat([h, hsp], dim=1)
            del hsp
-            h = module(h, emb, context)
+            h = module(h, emb, context, transformer_options)
        h = h.type(x.dtype)
        if self.predict_codebook_ids:
            return self.id_predictor(h)
--- a/comfy/ldm/modules/diffusionmodules/util.py
+++ b/comfy/ldm/modules/diffusionmodules/util.py
@ -34,6 +34,13 @@ def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2,
        betas = 1 - alphas[1:] / alphas[:-1]
        betas = np.clip(betas, a_min=0, a_max=0.999)

+    elif schedule == "squaredcos_cap_v2":  # used for karlo prior
+        # return early
+        return betas_for_alpha_bar(
+            n_timestep,
+            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
+        )
+
    elif schedule == "sqrt_linear":
        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
    elif schedule == "sqrt":
@ -218,6 +225,7 @@ class GroupNorm32(nn.GroupNorm):
    def forward(self, x):
        return super().forward(x.float()).type(x.dtype)

+
 def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.
@ -267,4 +275,4 @@ class HybridConditioner(nn.Module):
 def noise_like(shape, device, repeat=False):
    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
    noise = lambda: torch.randn(shape, device=device)
-    return repeat_noise() if repeat else noise()
+    return repeat_noise() if repeat else noise()
--- a/comfy/ldm/modules/encoders/kornia_functions.py
+++ b/comfy/ldm/modules/encoders/kornia_functions.py
@ -0,0 +1,59 @@
+
+
+from typing import List, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+#from: https://github.com/kornia/kornia/blob/master/kornia/enhance/normalize.py
+
+def enhance_normalize(data: torch.Tensor, mean: torch.Tensor, std: torch.Tensor) -> torch.Tensor:
+    r"""Normalize an image/video tensor channel-wise with a mean and standard deviation.
+
+    Each channel is mapped as ``(input[channel] - mean[channel]) / std[channel]``.
+
+    Args:
+        data: Image tensor of size :math:`(B, C, *)`.
+        mean: Mean for each channel. A 0-dim tensor, or one whose first
+            dimension is 1, is expanded to all `C` channels.
+        std: Standard deviation for each channel, handled like ``mean``.
+    Return:
+        Normalised tensor with same size as input :math:`(B, C, *)`.
+    Raises:
+        ValueError: if ``mean`` or ``std`` neither has `C` leading entries
+            nor shares its first two dimensions with ``data``.
+    Examples:
+        >>> x = torch.rand(1, 4, 3, 3)
+        >>> out = enhance_normalize(x, torch.tensor([0.0]), torch.tensor([255.]))
+        >>> out.shape
+        torch.Size([1, 4, 3, 3])
+        >>> x = torch.rand(1, 4, 3, 3)
+        >>> mean = torch.zeros(4)
+        >>> std = 255. * torch.ones(4)
+        >>> out = enhance_normalize(x, mean, std)
+        >>> out.shape
+        torch.Size([1, 4, 3, 3])
+    """
+    original_shape = data.shape
+    batch, channels = original_shape[0], original_shape[1]
+
+    # Broadcast scalar / single-entry statistics over every channel.
+    if mean.dim() == 0 or mean.shape[0] == 1:
+        mean = mean.expand(channels)
+    if std.dim() == 0 or std.shape[0] == 1:
+        std = std.expand(channels)
+
+    # A statistic is accepted when it is per-channel (length C) or when its
+    # first two dimensions equal those of data.
+    if mean.dim() > 0 and mean.shape[0] != 1:
+        per_channel = mean.shape[0] == channels
+        per_sample = mean.shape[:2] == data.shape[:2]
+        if not (per_channel or per_sample):
+            raise ValueError(f"mean length and number of channels do not match. Got {mean.shape} and {data.shape}.")
+
+    if std.dim() > 0 and std.shape[0] != 1:
+        per_channel = std.shape[0] == channels
+        per_sample = std.shape[:2] == data.shape[:2]
+        if not (per_channel or per_sample):
+            raise ValueError(f"std length and number of channels do not match. Got {std.shape} and {data.shape}.")
+
+    # Match the device and dtype of the data before doing arithmetic.
+    mean = torch.as_tensor(mean, device=data.device, dtype=data.dtype)
+    std = torch.as_tensor(std, device=data.device, dtype=data.dtype)
+
+    # Add a trailing axis so the statistics broadcast over the flattened
+    # per-channel elements.
+    if mean.dim() > 0:
+        mean = mean.unsqueeze(-1)
+    if std.dim() > 0:
+        std = std.unsqueeze(-1)
+
+    flattened = data.view(batch, channels, -1)
+    out: torch.Tensor = (flattened - mean) / std
+
+    return out.view(original_shape)
--- a/comfy/ldm/modules/encoders/modules.py
+++ b/comfy/ldm/modules/encoders/modules.py
@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+from . import kornia_functions
 from torch.utils.checkpoint import checkpoint

 from transformers import T5Tokenizer, T5EncoderModel, CLIPTokenizer, CLIPTextModel
@ -37,7 +38,7 @@ class ClassEmbedder(nn.Module):
        c = batch[key][:, None]
        if self.ucg_rate > 0. and not disable_dropout:
            mask = 1. - torch.bernoulli(torch.ones_like(c) * self.ucg_rate)
-            c = mask * c + (1-mask) * torch.ones_like(c)*(self.n_classes-1)
+            c = mask * c + (1 - mask) * torch.ones_like(c) * (self.n_classes - 1)
            c = c.long()
        c = self.embedding(c)
        return c
@ -57,18 +58,20 @@ def disabled_train(self, mode=True):

 class FrozenT5Embedder(AbstractEncoder):
    """Uses the T5 transformer encoder for text"""
-    def __init__(self, version="google/t5-v1_1-large", device="cuda", max_length=77, freeze=True):  # others are google/t5-v1_1-xl and google/t5-v1_1-xxl
+
+    def __init__(self, version="google/t5-v1_1-large", device="cuda", max_length=77,
+                 freeze=True):  # others are google/t5-v1_1-xl and google/t5-v1_1-xxl
        super().__init__()
        self.tokenizer = T5Tokenizer.from_pretrained(version)
        self.transformer = T5EncoderModel.from_pretrained(version)
        self.device = device
-        self.max_length = max_length   # TODO: typical value?
+        self.max_length = max_length  # TODO: typical value?
        if freeze:
            self.freeze()

    def freeze(self):
        self.transformer = self.transformer.eval()
-        #self.train = disabled_train
+        # self.train = disabled_train
        for param in self.parameters():
            param.requires_grad = False

@ -92,6 +95,7 @@ class FrozenCLIPEmbedder(AbstractEncoder):
        "pooled",
        "hidden"
    ]
+
    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77,
                 freeze=True, layer="last", layer_idx=None):  # clip-vit-base-patch32
        super().__init__()
@ -110,7 +114,7 @@ class FrozenCLIPEmbedder(AbstractEncoder):

    def freeze(self):
        self.transformer = self.transformer.eval()
-        #self.train = disabled_train
+        # self.train = disabled_train
        for param in self.parameters():
            param.requires_grad = False

@ -118,7 +122,7 @@ class FrozenCLIPEmbedder(AbstractEncoder):
        batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
                                        return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
        tokens = batch_encoding["input_ids"].to(self.device)
-        outputs = self.transformer(input_ids=tokens, output_hidden_states=self.layer=="hidden")
+        outputs = self.transformer(input_ids=tokens, output_hidden_states=self.layer == "hidden")
        if self.layer == "last":
            z = outputs.last_hidden_state
        elif self.layer == "pooled":
@ -131,15 +135,55 @@ class FrozenCLIPEmbedder(AbstractEncoder):
        return self(text)


+class ClipImageEmbedder(nn.Module):
+    """
+    Image embedder built on a model loaded through ``clip.load``.
+
+    ``forward`` resizes the input to 224x224, maps it from [-1, 1] to [0, 1],
+    normalizes it with the ``mean``/``std`` buffers and returns the result of
+    the model's ``encode_image`` cast to the input dtype.  When ``ucg_rate`` is
+    positive, whole samples of the embedding batch are zeroed at random.
+
+    Args:
+        model: name handed to ``clip.load``.
+        jit: forwarded to ``clip.load``.
+        device: forwarded to ``clip.load``.
+        antialias: whether the bicubic resize in ``preprocess`` is antialiased.
+        ucg_rate: probability of zeroing a sample's embedding in ``forward``.
+    """
+
+    def __init__(
+            self,
+            model,
+            jit=False,
+            device='cuda' if torch.cuda.is_available() else 'cpu',
+            antialias=True,
+            ucg_rate=0.
+    ):
+        super().__init__()
+        # Function-scope import: `clip` is only required once an instance is built.
+        from clip import load as load_clip
+        self.model, _ = load_clip(name=model, device=device, jit=jit)
+
+        self.antialias = antialias
+
+        # Registered as non-persistent buffers (persistent=False).
+        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
+        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
+        self.ucg_rate = ucg_rate
+
+    def preprocess(self, x):
+        """Resize ``x`` to 224x224, map [-1, 1] to [0, 1] and apply the mean/std normalization."""
+        # Honour the constructor's `antialias` option (it was previously stored but
+        # ignored because the resize hard-coded antialias=True).
+        x = torch.nn.functional.interpolate(x, size=(224, 224), mode='bicubic', align_corners=True,
+                                            antialias=self.antialias)
+        # normalize to [0,1]
+        x = (x + 1.) / 2.
+        # re-normalize according to clip
+        x = kornia_functions.enhance_normalize(x, self.mean, self.std)
+        return x
+
+    def forward(self, x, no_dropout=False):
+        """Embed ``x`` (assumed to be in range [-1, 1]); ``no_dropout=True`` disables the random zeroing."""
+        out = self.model.encode_image(self.preprocess(x))
+        out = out.to(x.dtype)
+        if self.ucg_rate > 0. and not no_dropout:
+            # One Bernoulli keep-flag per sample (keep probability 1 - ucg_rate),
+            # broadcast over the embedding dimension.
+            out = torch.bernoulli((1. - self.ucg_rate) * torch.ones(out.shape[0], device=out.device))[:, None] * out
+        return out
+
+
 class FrozenOpenCLIPEmbedder(AbstractEncoder):
    """
    Uses the OpenCLIP transformer encoder for text
    """
    LAYERS = [
-        #"pooled",
+        # "pooled",
        "last",
        "penultimate"
    ]
+
    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77,
                 freeze=True, layer="last"):
        super().__init__()
@ -179,7 +223,7 @@ class FrozenOpenCLIPEmbedder(AbstractEncoder):
        x = self.model.ln_final(x)
        return x

-    def text_transformer_forward(self, x: torch.Tensor, attn_mask = None):
+    def text_transformer_forward(self, x: torch.Tensor, attn_mask=None):
        for i, r in enumerate(self.model.transformer.resblocks):
            if i == len(self.model.transformer.resblocks) - self.layer_idx:
                break
@ -193,14 +237,73 @@ class FrozenOpenCLIPEmbedder(AbstractEncoder):
        return self(text)


+class FrozenOpenCLIPImageEmbedder(AbstractEncoder):
+    """
+    Uses the OpenCLIP vision transformer encoder for images
+
+    Args:
+        arch: architecture name passed to ``open_clip.create_model_and_transforms``.
+        version: passed as ``pretrained`` to ``open_clip.create_model_and_transforms``.
+        device: stored on the instance; the model itself is created on CPU.
+        max_length: stored on the instance.
+        freeze: when true, put the model in eval mode and disable gradients.
+        layer: stored on the instance; ``"penultimate"`` raises ``NotImplementedError``.
+        antialias: whether the bicubic resize in ``preprocess`` is antialiased.
+        ucg_rate: probability of zeroing a sample's embedding in ``forward``.
+    """
+
+    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77,
+                 freeze=True, layer="pooled", antialias=True, ucg_rate=0.):
+        super().__init__()
+        model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'),
+                                                            pretrained=version, )
+        # Only `model.visual` is used by this class, so `model.transformer` is dropped
+        # (presumably the text tower -- confirm against open_clip).
+        del model.transformer
+        self.model = model
+
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        if self.layer == "penultimate":
+            # The unreachable `self.layer_idx = 1` that followed this raise was removed.
+            raise NotImplementedError()
+
+        self.antialias = antialias
+
+        # Registered as non-persistent buffers (persistent=False).
+        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
+        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
+        self.ucg_rate = ucg_rate
+
+    def preprocess(self, x):
+        """Resize ``x`` to 224x224, map [-1, 1] to [0, 1] and apply the mean/std normalization."""
+        # Honour the constructor's `antialias` option (it was previously stored but
+        # ignored because the resize hard-coded antialias=True).
+        x = torch.nn.functional.interpolate(x, size=(224, 224), mode='bicubic', align_corners=True,
+                                            antialias=self.antialias)
+        # normalize to [0,1]
+        x = (x + 1.) / 2.
+        # renormalize according to clip
+        x = kornia_functions.enhance_normalize(x, self.mean, self.std)
+        return x
+
+    def freeze(self):
+        """Switch the model to eval mode and disable gradients on all parameters."""
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def forward(self, image, no_dropout=False):
+        """Embed ``image``; ``no_dropout=True`` disables the random zeroing of samples."""
+        z = self.encode_with_vision_transformer(image)
+        if self.ucg_rate > 0. and not no_dropout:
+            # One Bernoulli keep-flag per sample (keep probability 1 - ucg_rate),
+            # broadcast over the embedding dimension.
+            z = torch.bernoulli((1. - self.ucg_rate) * torch.ones(z.shape[0], device=z.device))[:, None] * z
+        return z
+
+    def encode_with_vision_transformer(self, img):
+        """Preprocess ``img`` and run it through ``self.model.visual``."""
+        img = self.preprocess(img)
+        x = self.model.visual(img)
+        return x
+
+    def encode(self, text):
+        # The parameter is named `text`, but the value is forwarded unchanged to
+        # `forward`, which treats it as an image.
+        return self(text)
+
+
 class FrozenCLIPT5Encoder(AbstractEncoder):
    def __init__(self, clip_version="openai/clip-vit-large-patch14", t5_version="google/t5-v1_1-xl", device="cuda",
                 clip_max_length=77, t5_max_length=77):
        super().__init__()
        self.clip_encoder = FrozenCLIPEmbedder(clip_version, device, max_length=clip_max_length)
        self.t5_encoder = FrozenT5Embedder(t5_version, device, max_length=t5_max_length)
-        print(f"{self.clip_encoder.__class__.__name__} has {count_params(self.clip_encoder)*1.e-6:.2f} M parameters, "
-              f"{self.t5_encoder.__class__.__name__} comes with {count_params(self.t5_encoder)*1.e-6:.2f} M params.")
+        print(f"{self.clip_encoder.__class__.__name__} has {count_params(self.clip_encoder) * 1.e-6:.2f} M parameters, "
+              f"{self.t5_encoder.__class__.__name__} comes with {count_params(self.t5_encoder) * 1.e-6:.2f} M params.")

    def encode(self, text):
        return self(text)
@ -209,5 +312,3 @@ class FrozenCLIPT5Encoder(AbstractEncoder):
        clip_z = self.clip_encoder.encode(text)
        t5_z = self.t5_encoder.encode(text)
        return [clip_z, t5_z]
-
-
--- a/comfy/ldm/modules/encoders/noise_aug_modules.py
+++ b/comfy/ldm/modules/encoders/noise_aug_modules.py
@ -0,0 +1,35 @@
+from ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation
+from ldm.modules.diffusionmodules.openaimodel import Timestep
+import torch
+
+class CLIPEmbeddingNoiseAugmentation(ImageConcatWithNoiseAugmentation):
+    """
+    Noise augmentation that is applied in a standardized space.
+
+    Inputs are shifted and scaled with the ``data_mean`` / ``data_std`` buffers,
+    passed through the inherited ``q_sample`` and mapped back afterwards.  The
+    statistics come from ``clip_stats_path`` when given, otherwise they default
+    to zeros / ones of length ``timestep_dim``.
+    """
+
+    def __init__(self, *args, clip_stats_path=None, timestep_dim=256, **kwargs):
+        super().__init__(*args, **kwargs)
+        if clip_stats_path is not None:
+            clip_mean, clip_std = torch.load(clip_stats_path, map_location="cpu")
+        else:
+            clip_mean = torch.zeros(timestep_dim)
+            clip_std = torch.ones(timestep_dim)
+        # Stored with a leading axis of size 1; both buffers are non-persistent.
+        self.register_buffer("data_mean", clip_mean[None, :], persistent=False)
+        self.register_buffer("data_std", clip_std[None, :], persistent=False)
+        self.time_embed = Timestep(timestep_dim)
+
+    def scale(self, x):
+        """Standardize ``x`` with the stored mean and std."""
+        return (x - self.data_mean) * 1. / self.data_std
+
+    def unscale(self, x):
+        """Inverse of ``scale``: restore the original data statistics."""
+        return (x * self.data_std) + self.data_mean
+
+    def forward(self, x, noise_level=None):
+        """
+        Return ``(z, noise_level_embedding)`` for ``x``.
+
+        When ``noise_level`` is omitted, one level per sample is drawn uniformly
+        from ``[0, self.max_noise_level)``; otherwise it must be a tensor.
+        """
+        if noise_level is not None:
+            assert isinstance(noise_level, torch.Tensor)
+        else:
+            noise_level = torch.randint(0, self.max_noise_level, (x.shape[0],), device=x.device).long()
+        standardized = self.scale(x)
+        noised = self.q_sample(standardized, noise_level)
+        z = self.unscale(noised)
+        return z, self.time_embed(noise_level)
--- a/comfy/ldm/modules/tomesd.py
+++ b/comfy/ldm/modules/tomesd.py
@ -0,0 +1,117 @@
+
+
+import torch
+from typing import Tuple, Callable
+import math
+
+def do_nothing(x: torch.Tensor, mode: str = None):
+    """Identity stand-in for merge/unmerge: return ``x`` untouched; ``mode`` is accepted and ignored."""
+    return x
+
+
+def bipartite_soft_matching_random2d(metric: torch.Tensor,
+                                     w: int, h: int, sx: int, sy: int, r: int,
+                                     no_rand: bool = False) -> Tuple[Callable, Callable]:
+    """
+    Partitions the tokens into src and dst and merges r tokens from src to dst.
+    Dst tokens are partitioned by choosing one randomly in each (sx, sy) region.
+
+    Args:
+     - metric [B, N, C]: metric to use for similarity
+     - w: image width in tokens
+     - h: image height in tokens
+     - sx: stride in the x dimension for dst, must divide w
+     - sy: stride in the y dimension for dst, must divide h
+     - r: number of tokens to remove (by merging)
+     - no_rand: if true, disable randomness (use top left corner only)
+
+    Returns:
+        A ``(merge, unmerge)`` pair of closures over the indices computed here.
+        Both expand those indices with the batch size ``B`` of ``metric``, so the
+        tensors passed to them need that same batch size.  When ``r <= 0`` both
+        entries are ``do_nothing``.
+    """
+    B, N, _ = metric.shape
+
+    # Nothing to merge: hand back identity functions.
+    if r <= 0:
+        return do_nothing, do_nothing
+    
+    # The matching below only produces indices, so it runs without gradient tracking.
+    with torch.no_grad():
+        
+        hsy, wsx = h // sy, w // sx
+
+        # For each sy by sx kernel, randomly assign one token to be dst and the rest src
+        idx_buffer = torch.zeros(1, hsy, wsx, sy*sx, 1, device=metric.device)
+
+        # rand_idx is the position inside each cell (0 .. sy*sx - 1) of the dst token.
+        if no_rand:
+            rand_idx = torch.zeros(1, hsy, wsx, 1, 1, device=metric.device, dtype=torch.int64)
+        else:
+            rand_idx = torch.randint(sy*sx, size=(1, hsy, wsx, 1, 1), device=metric.device)
+        
+        # Mark the chosen dst token of every cell with -1; src tokens stay 0.
+        idx_buffer.scatter_(dim=3, index=rand_idx, src=-torch.ones_like(rand_idx, dtype=idx_buffer.dtype))
+        # Regroup (cell row, cell col, y in cell, x in cell) into (row, column) order and
+        # flatten to N entries; the reshape requires hsy*sy*wsx*sx == N.
+        idx_buffer = idx_buffer.view(1, hsy, wsx, sy, sx, 1).transpose(2, 3).reshape(1, N, 1)
+        # Ascending argsort puts the -1 (dst) entries ahead of the 0 (src) entries.
+        rand_idx   = idx_buffer.argsort(dim=1)
+
+        num_dst = int((1 / (sx*sy)) * N)
+        a_idx = rand_idx[:, num_dst:, :] # src
+        b_idx = rand_idx[:, :num_dst, :] # dst
+
+        # Gather the src and dst token subsets of x along the token axis.
+        def split(x):
+            C = x.shape[-1]
+            src = x.gather(dim=1, index=a_idx.expand(B, N - num_dst, C))
+            dst = x.gather(dim=1, index=b_idx.expand(B, num_dst, C))
+            return src, dst
+
+        # Unit-normalize along the last axis so the products below are cosine similarities.
+        metric = metric / metric.norm(dim=-1, keepdim=True)
+        a, b = split(metric)
+        scores = a @ b.transpose(-1, -2)
+
+        # Can't reduce more than the # tokens in src
+        r = min(a.shape[1], r)
+
+        # For every src token: its best similarity to any dst token and that dst token's index.
+        node_max, node_idx = scores.max(dim=-1)
+        # Src tokens ordered from most to least similar to their best dst match.
+        edge_idx = node_max.argsort(dim=-1, descending=True)[..., None]
+
+        unm_idx = edge_idx[..., r:, :]  # Unmerged Tokens
+        src_idx = edge_idx[..., :r, :]  # Merged Tokens
+        dst_idx = node_idx[..., None].gather(dim=-2, index=src_idx)
+
+    def merge(x: torch.Tensor, mode="mean") -> torch.Tensor:
+        # Reduce the r selected src tokens into their matched dst tokens (`mode` is passed
+        # to scatter_reduce as `reduce`) and return the unmerged src tokens followed by dst.
+        src, dst = split(x)
+        n, t1, c = src.shape
+        
+        unm = src.gather(dim=-2, index=unm_idx.expand(n, t1 - r, c))
+        src = src.gather(dim=-2, index=src_idx.expand(n, r, c))
+        dst = dst.scatter_reduce(-2, dst_idx.expand(n, r, c), src, reduce=mode)
+
+        return torch.cat([unm, dst], dim=1)
+
+    def unmerge(x: torch.Tensor) -> torch.Tensor:
+        # Rebuild a [B, N, c] tensor from merge()'s layout: dst and unmerged tokens return to
+        # their original positions, and each merged src position receives a copy of the dst
+        # token it was merged into.
+        unm_len = unm_idx.shape[1]
+        unm, dst = x[..., :unm_len, :], x[..., unm_len:, :]
+        _, _, c = unm.shape
+
+        src = dst.gather(dim=-2, index=dst_idx.expand(B, r, c))
+
+        # Combine back to the original shape
+        out = torch.zeros(B, N, c, device=x.device, dtype=x.dtype)
+        out.scatter_(dim=-2, index=b_idx.expand(B, num_dst, c), src=dst)
+        out.scatter_(dim=-2, index=a_idx.expand(B, a_idx.shape[1], 1).gather(dim=1, index=unm_idx).expand(B, unm_len, c), src=unm)
+        out.scatter_(dim=-2, index=a_idx.expand(B, a_idx.shape[1], 1).gather(dim=1, index=src_idx).expand(B, r, c), src=src)
+
+        return out
+
+    return merge, unmerge
+
+
+def get_functions(x, ratio, original_shape):
+    """
+    Build the token merge/unmerge pair for ``x``.
+
+    Args:
+     - x: token tensor; ``x.shape[1]`` is read as the current token count and
+       ``x`` itself is used as the similarity metric
+     - ratio: fraction of the current tokens to merge (``r = int(x.shape[1] * ratio)``)
+     - original_shape: 4-tuple whose last two entries are the original height and width
+
+    Returns:
+        ``(merge, unmerge)``.  Merging is only set up when the downsample factor
+        derived from the token count is at most ``max_downsample`` (1); otherwise
+        both entries are identity functions.
+    """
+    _, _, original_h, original_w = original_shape
+    original_tokens = original_h * original_w
+    downsample = int(math.sqrt(original_tokens // x.shape[1]))
+    stride_x = 2
+    stride_y = 2
+    max_downsample = 1
+
+    if downsample <= max_downsample:
+        w = original_w // downsample
+        h = original_h // downsample
+        r = int(x.shape[1] * ratio)
+        no_rand = False
+        m, u = bipartite_soft_matching_random2d(x, w, h, stride_x, stride_y, r, no_rand)
+        return m, u
+
+    # The fallback accepts the same optional `mode` argument as the real merge function,
+    # so callers can invoke both paths with identical arguments.
+    def nothing(y, mode=None):
+        return y
+
+    return nothing, nothing
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@ -26,7 +26,7 @@ class CFGDenoiser(torch.nn.Module):

 #The main sampling function shared by all the samplers
 #Returns predicted noise
-def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, cond_concat=None):
+def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, cond_concat=None, model_options={}):
        def get_area_and_mult(cond, x_in, cond_concat_in, timestep_in):
            area = (x_in.shape[2], x_in.shape[3], 0, 0)
            strength = 1.0
@ -35,6 +35,10 @@ def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, con
            if 'strength' in cond[1]:
                strength = cond[1]['strength']

+            adm_cond = None
+            if 'adm' in cond[1]:
+                adm_cond = cond[1]['adm']
+
            input_x = x_in[:,:,area[2]:area[0] + area[2],area[3]:area[1] + area[3]]
            mult = torch.ones_like(input_x) * strength

@ -60,6 +64,9 @@ def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, con
                    cropped.append(cr)
                conditionning['c_concat'] = torch.cat(cropped, dim=1)

+            if adm_cond is not None:
+                conditionning['c_adm'] = adm_cond
+
            control = None
            if 'control' in cond[1]:
                control = cond[1]['control']
@ -76,6 +83,9 @@ def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, con
            if 'c_concat' in c1:
                if c1['c_concat'].shape != c2['c_concat'].shape:
                    return False
+            if 'c_adm' in c1:
+                if c1['c_adm'].shape != c2['c_adm'].shape:
+                    return False
            return True

        def can_concat_cond(c1, c2):
@ -92,19 +102,24 @@ def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, con
        def cond_cat(c_list):
            c_crossattn = []
            c_concat = []
+            c_adm = []
            for x in c_list:
                if 'c_crossattn' in x:
                    c_crossattn.append(x['c_crossattn'])
                if 'c_concat' in x:
                    c_concat.append(x['c_concat'])
+                if 'c_adm' in x:
+                    c_adm.append(x['c_adm'])
            out = {}
            if len(c_crossattn) > 0:
                out['c_crossattn'] = [torch.cat(c_crossattn)]
            if len(c_concat) > 0:
                out['c_concat'] = [torch.cat(c_concat)]
+            if len(c_adm) > 0:
+                out['c_adm'] = torch.cat(c_adm)
            return out

-        def calc_cond_uncond_batch(model_function, cond, uncond, x_in, timestep, max_total_area, cond_concat_in):
+        def calc_cond_uncond_batch(model_function, cond, uncond, x_in, timestep, max_total_area, cond_concat_in, model_options):
            out_cond = torch.zeros_like(x_in)
            out_count = torch.ones_like(x_in)/100000.0

@ -169,6 +184,9 @@ def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, con
                if control is not None:
                    c['control'] = control.get_control(input_x, timestep_, c['c_crossattn'], len(cond_or_uncond))

+                if 'transformer_options' in model_options:
+                    c['transformer_options'] = model_options['transformer_options']
+
                output = model_function(input_x, timestep_, cond=c).chunk(batch_chunks)
                del input_x

@ -192,7 +210,7 @@ def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, con


        max_total_area = model_management.maximum_batch_area()
-        cond, uncond = calc_cond_uncond_batch(model_function, cond, uncond, x, timestep, max_total_area, cond_concat)
+        cond, uncond = calc_cond_uncond_batch(model_function, cond, uncond, x, timestep, max_total_area, cond_concat, model_options)
        return uncond + (cond - uncond) * cond_scale


@ -209,8 +227,8 @@ class CFGNoisePredictor(torch.nn.Module):
        super().__init__()
        self.inner_model = model
        self.alphas_cumprod = model.alphas_cumprod
-    def apply_model(self, x, timestep, cond, uncond, cond_scale, cond_concat=None):
-        out = sampling_function(self.inner_model.apply_model, x, timestep, uncond, cond, cond_scale, cond_concat)
+    def apply_model(self, x, timestep, cond, uncond, cond_scale, cond_concat=None, model_options={}):
+        out = sampling_function(self.inner_model.apply_model, x, timestep, uncond, cond, cond_scale, cond_concat, model_options=model_options)
        return out


@ -218,11 +236,11 @@ class KSamplerX0Inpaint(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.inner_model = model
-    def forward(self, x, sigma, uncond, cond, cond_scale, denoise_mask, cond_concat=None):
+    def forward(self, x, sigma, uncond, cond, cond_scale, denoise_mask, cond_concat=None, model_options={}):
        if denoise_mask is not None:
            latent_mask = 1. - denoise_mask
            x = x * denoise_mask + (self.latent_image + self.noise * sigma.reshape([sigma.shape[0]] + [1] * (len(self.noise.shape) - 1))) * latent_mask
-        out = self.inner_model(x, sigma, cond=cond, uncond=uncond, cond_scale=cond_scale, cond_concat=cond_concat)
+        out = self.inner_model(x, sigma, cond=cond, uncond=uncond, cond_scale=cond_scale, cond_concat=cond_concat, model_options=model_options)
        if denoise_mask is not None:
            out *= denoise_mask

@ -324,13 +342,37 @@ def apply_control_net_to_equal_area(conds, uncond):
            n['control'] = cond_cnets[x]
            uncond[temp[1]] = [o[0], n]

+def encode_adm(noise_augmentor, conds, batch_size, device):
+    """
+    Replace the ``"adm"`` entry of every conditioning with an encoded tensor.
+
+    Each element of ``conds`` is indexed as ``[_, options_dict]``.  When the
+    dict has an ``"adm"`` list of ``(embedding_output, weight)`` pairs, each
+    pair's ``image_embeds`` is passed through ``noise_augmentor`` at noise
+    level 0, concatenated with the returned noise-level embedding, scaled by
+    its weight, and all pairs are summed.  Otherwise a zero tensor of width
+    ``noise_augmentor.time_embed.dim * 2`` is used.  The result is repeated
+    ``batch_size`` times along the first axis.
+
+    The options dict is shallow-copied before being written to, but the
+    elements of ``conds`` are updated in place; ``conds`` itself is returned.
+    """
+    for x in conds:
+        if 'adm' in x[1]:
+            adm_inputs = []
+            for adm_c in x[1]["adm"]:
+                adm_cond = adm_c[0].image_embeds
+                weight = adm_c[1]
+                c_adm, noise_level_emb = noise_augmentor(adm_cond.to(device), noise_level=torch.tensor([0], device=device))
+                adm_inputs.append(torch.cat((c_adm, noise_level_emb), 1) * weight)
+
+            adm_out = torch.stack(adm_inputs).sum(0)
+            #TODO: Apply Noise to Embedding Mix
+        else:
+            adm_out = torch.zeros((1, noise_augmentor.time_embed.dim * 2), device=device)
+        # Copy first so the caller's original options dict is left untouched.
+        x[1] = x[1].copy()
+        x[1]["adm"] = torch.cat([adm_out] * batch_size)
+
+    return conds
+
 class KSampler:
    SCHEDULERS = ["karras", "normal", "simple", "ddim_uniform"]
    SAMPLERS = ["euler", "euler_ancestral", "heun", "dpm_2", "dpm_2_ancestral",
                "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_sde",
                "dpmpp_2m", "ddim", "uni_pc", "uni_pc_bh2"]

-    def __init__(self, model, steps, device, sampler=None, scheduler=None, denoise=None):
+    def __init__(self, model, steps, device, sampler=None, scheduler=None, denoise=None, model_options={}):
        self.model = model
        self.model_denoise = CFGNoisePredictor(self.model)
        if self.model.parameterization == "v":
@ -350,6 +392,7 @@ class KSampler:
        self.sigma_max=float(self.model_wrap.sigma_max)
        self.set_steps(steps, denoise)
        self.denoise = denoise
+        self.model_options = model_options

    def _calculate_sigmas(self, steps):
        sigmas = None
@ -418,10 +461,14 @@ class KSampler:
        else:
            precision_scope = contextlib.nullcontext

-        extra_args = {"cond":positive, "uncond":negative, "cond_scale": cfg}
+        if hasattr(self.model, 'noise_augmentor'): #unclip
+            positive = encode_adm(self.model.noise_augmentor, positive, noise.shape[0], self.device)
+            negative = encode_adm(self.model.noise_augmentor, negative, noise.shape[0], self.device)
+
+        extra_args = {"cond":positive, "uncond":negative, "cond_scale": cfg, "model_options": self.model_options}

        cond_concat = None
-        if hasattr(self.model, 'concat_keys'):
+        if hasattr(self.model, 'concat_keys'): #inpaint
            cond_concat = []
            for ck in self.model.concat_keys:
                if denoise_mask is not None:
@ -467,7 +514,7 @@ class KSampler:
                                                     x_T=z_enc,
                                                     x0=latent_image,
                                                     denoise_function=sampling_function,
-                                                     cond_concat=cond_concat,
+                                                     extra_args=extra_args,
                                                     mask=noise_mask,
                                                     to_zero=sigmas[-1]==0,
                                                     end_step=sigmas.shape[0] - 1)
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -1,5 +1,6 @@
 import torch
 import contextlib
+import copy

 import sd1_clip
 import sd2_clip
@ -11,6 +12,7 @@ from .cldm import cldm
 from .t2i_adapter import adapter

 from . import utils
+from . import clip_vision

 def load_torch_file(ckpt):
    if ckpt.lower().endswith(".safetensors"):
@ -52,6 +54,8 @@ def load_model_weights(model, sd, verbose=False, load_state_dict_to=[]):
        if x in sd:
            sd[keys_to_replace[x]] = sd.pop(x)

+    sd = utils.transformers_convert(sd, "cond_stage_model.model", "cond_stage_model.transformer.text_model", 24)
+
    resblock_to_replace = {
        "ln_1": "layer_norm1",
        "ln_2": "layer_norm2",
@ -122,7 +126,7 @@ LORA_UNET_MAP_RESNET = {
 }

 def load_lora(path, to_load):
-    lora = load_torch_file(path)
+    lora = utils.load_torch_file(path)
    patch_dict = {}
    loaded_keys = set()
    for x in to_load:
@ -274,12 +278,20 @@ class ModelPatcher:
        self.model = model
        self.patches = []
        self.backup = {}
+        self.model_options = {"transformer_options":{}}

    def clone(self):
        n = ModelPatcher(self.model)
        n.patches = self.patches[:]
+        n.model_options = copy.deepcopy(self.model_options)
        return n

+    def set_model_tomesd(self, ratio):
+        """Record ``{"ratio": ratio}`` under the ``"tomesd"`` key of this patcher's transformer options."""
+        transformer_options = self.model_options["transformer_options"]
+        transformer_options["tomesd"] = {"ratio": ratio}
+
+    def model_dtype(self):
+        """Return the ``dtype`` attribute of the wrapped model's ``diffusion_model``."""
+        diffusion_model = self.model.diffusion_model
+        return diffusion_model.dtype
+
    def add_patches(self, patches, strength=1.0, block_weights={}):
        p = {}
        model_sd = self.model.state_dict()
@ -305,7 +317,6 @@ class ModelPatcher:
            for k in p:
                v = p[k][1]
                key = k
-
                if key not in model_sd:
                    print("could not patch. key doesn't exist in model:", k)
                    continue
@ -601,7 +612,7 @@ class ControlNet:
        return out

 def load_controlnet(ckpt_path, model=None):
-    controlnet_data = load_torch_file(ckpt_path)
+    controlnet_data = utils.load_torch_file(ckpt_path)
    pth_key = 'control_model.input_blocks.1.1.transformer_blocks.0.attn2.to_k.weight'
    pth = False
    sd2 = False
@ -795,7 +806,7 @@ class StyleModel:


 def load_style_model(ckpt_path):
-    model_data = load_torch_file(ckpt_path)
+    model_data = utils.load_torch_file(ckpt_path)
    keys = model_data.keys()
    if "style_embedding" in keys:
        model = adapter.StyleAdapter(width=1024, context_dim=768, num_head=8, n_layes=3, num_token=8)
@ -806,7 +817,7 @@ def load_style_model(ckpt_path):


 def load_clip(ckpt_path, embedding_directory=None):
-    clip_data = load_torch_file(ckpt_path)
+    clip_data = utils.load_torch_file(ckpt_path)
    config = {}
    if "text_model.encoder.layers.22.mlp.fc1.weight" in clip_data:
        config['target'] = 'ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder'
@ -849,7 +860,7 @@ def load_checkpoint(config_path, ckpt_path, output_vae=True, output_clip=True, e
        load_state_dict_to = [w]

    model = instantiate_from_config(config["model"])
-    sd = load_torch_file(ckpt_path)
+    sd = utils.load_torch_file(ckpt_path)
    model = load_model_weights(model, sd, verbose=False, load_state_dict_to=load_state_dict_to)

    if fp16:
@ -858,10 +869,11 @@ def load_checkpoint(config_path, ckpt_path, output_vae=True, output_clip=True, e
    return (ModelPatcher(model), clip, vae)


-def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, embedding_directory=None):
-    sd = load_torch_file(ckpt_path)
+def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None):
+    sd = utils.load_torch_file(ckpt_path)
    sd_keys = sd.keys()
    clip = None
+    clipvision = None
    vae = None

    fp16 = model_management.should_use_fp16()
@ -886,6 +898,29 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, e
        w.cond_stage_model = clip.cond_stage_model
        load_state_dict_to = [w]

+    clipvision_key = "embedder.model.visual.transformer.resblocks.0.attn.in_proj_weight"
+    noise_aug_config = None
+    if clipvision_key in sd_keys:
+        size = sd[clipvision_key].shape[1]
+
+        if output_clipvision:
+            clipvision = clip_vision.load_clipvision_from_sd(sd)
+
+        noise_aug_key = "noise_augmentor.betas"
+        if noise_aug_key in sd_keys:
+            noise_aug_config = {}
+            params = {}
+            noise_schedule_config = {}
+            noise_schedule_config["timesteps"] = sd[noise_aug_key].shape[0]
+            noise_schedule_config["beta_schedule"] = "squaredcos_cap_v2"
+            params["noise_schedule_config"] = noise_schedule_config
+            noise_aug_config['target'] = "ldm.modules.encoders.noise_aug_modules.CLIPEmbeddingNoiseAugmentation"
+            if size == 1280: #h
+                params["timestep_dim"] = 1024
+            elif size == 1024: #l
+                params["timestep_dim"] = 768
+            noise_aug_config['params'] = params
+
    sd_config = {
        "linear_start": 0.00085,
        "linear_end": 0.012,
@ -934,7 +969,13 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, e
    sd_config["unet_config"] = {"target": "ldm.modules.diffusionmodules.openaimodel.UNetModel", "params": unet_config}
    model_config = {"target": "ldm.models.diffusion.ddpm.LatentDiffusion", "params": sd_config}

-    if unet_config["in_channels"] > 4: #inpainting model
+    if noise_aug_config is not None: #SD2.x unclip model
+        sd_config["noise_aug_config"] = noise_aug_config
+        sd_config["image_size"] = 96
+        sd_config["embedding_dropout"] = 0.25
+        sd_config["conditioning_key"] = 'crossattn-adm'
+        model_config["target"] = "ldm.models.diffusion.ddpm.ImageEmbeddingConditionedLatentDiffusion"
+    elif unet_config["in_channels"] > 4: #inpainting model
        sd_config["conditioning_key"] = "hybrid"
        sd_config["finetune_keys"] = None
        model_config["target"] = "ldm.models.diffusion.ddpm.LatentInpaintDiffusion"
@ -946,6 +987,11 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, e
    else:
        unet_config["num_heads"] = 8 #SD1.x

+    unclip = 'model.diffusion_model.label_emb.0.0.weight'
+    if unclip in sd_keys:
+        unet_config["num_classes"] = "sequential"
+        unet_config["adm_in_channels"] = sd[unclip].shape[1]
+
    if unet_config["context_dim"] == 1024 and unet_config["in_channels"] == 4: #only SD2.x non inpainting models are v prediction
        k = "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.norm1.bias"
        out = sd[k]
@ -958,4 +1004,4 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, e
    if fp16:
        model = model.half()

-    return (ModelPatcher(model), clip, vae)
+    return (ModelPatcher(model), clip, vae, clipvision)
--- a/comfy/utils.py
+++ b/comfy/utils.py
@ -1,5 +1,47 @@
 import torch

+def load_torch_file(ckpt):
+    """
+    Load a checkpoint from ``ckpt`` onto the CPU and return its state dict.
+
+    Paths ending in ``.safetensors`` (case-insensitive) are read with
+    ``safetensors.torch.load_file``.  Anything else goes through ``torch.load``;
+    if the loaded object has a ``"global_step"`` entry it is printed, and if it
+    has a ``"state_dict"`` entry that entry is returned instead of the whole object.
+    """
+    if not ckpt.lower().endswith(".safetensors"):
+        # NOTE(review): torch.load unpickles the file, which can execute arbitrary code
+        # for untrusted checkpoints -- confirm callers only pass trusted files.
+        pl_sd = torch.load(ckpt, map_location="cpu")
+        if "global_step" in pl_sd:
+            print(f"Global Step: {pl_sd['global_step']}")
+        return pl_sd["state_dict"] if "state_dict" in pl_sd else pl_sd
+
+    import safetensors.torch
+    return safetensors.torch.load_file(ckpt, device="cpu")
+
+def transformers_convert(sd, prefix_from, prefix_to, number):
+    """
+    Rename resblock keys of ``sd`` to the ``encoder.layers`` naming, in place.
+
+    For each layer index below ``number``, keys under
+    ``{prefix_from}.transformer.resblocks.{i}`` are moved to
+    ``{prefix_to}.encoder.layers.{i}`` with the sub-module names mapped below.
+    The fused ``attn.in_proj_weight`` / ``attn.in_proj_bias`` entries are cut
+    into three equal slices along the first axis and stored as the q, k and v
+    projections.  Keys that are absent are skipped.  Returns the same ``sd``.
+    """
+    renamed_parts = (
+        ("ln_1", "layer_norm1"),
+        ("ln_2", "layer_norm2"),
+        ("mlp.c_fc", "mlp.fc1"),
+        ("mlp.c_proj", "mlp.fc2"),
+        ("attn.out_proj", "self_attn.out_proj"),
+    )
+    qkv_targets = ("self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj")
+    suffixes = ("weight", "bias")
+
+    for layer in range(number):
+        src_base = "{}.transformer.resblocks.{}".format(prefix_from, layer)
+        dst_base = "{}.encoder.layers.{}".format(prefix_to, layer)
+
+        # Straight one-to-one renames.
+        for old_part, new_part in renamed_parts:
+            for suffix in suffixes:
+                old_key = "{}.{}.{}".format(src_base, old_part, suffix)
+                if old_key in sd:
+                    sd["{}.{}.{}".format(dst_base, new_part, suffix)] = sd.pop(old_key)
+
+        # Fused input projection -> separate q / k / v entries.
+        for suffix in suffixes:
+            fused_key = "{}.attn.in_proj_{}".format(src_base, suffix)
+            if fused_key not in sd:
+                continue
+            fused = sd.pop(fused_key)
+            chunk = fused.shape[0] // 3
+            for i, target in enumerate(qkv_targets):
+                sd["{}.{}.{}".format(dst_base, target, suffix)] = fused[chunk * i:chunk * (i + 1)]
+    return sd
+
 def common_upscale(samples, width, height, upscale_method, crop):
        if crop == "center":
            old_width = samples.shape[3]
--- a/comfy_extras/clip_vision.py
+++ b/comfy_extras/clip_vision.py
@ -1,32 +0,0 @@
-from transformers import CLIPVisionModel, CLIPVisionConfig, CLIPImageProcessor
-from comfy.sd import load_torch_file
-import os
-
-class ClipVisionModel():
-    def __init__(self):
-        json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config.json")
-        config = CLIPVisionConfig.from_json_file(json_config)
-        self.model = CLIPVisionModel(config)
-        self.processor = CLIPImageProcessor(crop_size=224,
-                                            do_center_crop=True,
-                                            do_convert_rgb=True,
-                                            do_normalize=True,
-                                            do_resize=True,
-                                            image_mean=[ 0.48145466,0.4578275,0.40821073],
-                                            image_std=[0.26862954,0.26130258,0.27577711],
-                                            resample=3, #bicubic
-                                            size=224)
-
-    def load_sd(self, sd):
-        self.model.load_state_dict(sd, strict=False)
-
-    def encode_image(self, image):
-        inputs = self.processor(images=[image[0]], return_tensors="pt")
-        outputs = self.model(**inputs)
-        return outputs
-
-def load(ckpt_path):
-    clip_data = load_torch_file(ckpt_path)
-    clip = ClipVisionModel()
-    clip.load_sd(clip_data)
-    return clip
--- a/comfy_extras/nodes_upscale_model.py
+++ b/comfy_extras/nodes_upscale_model.py
@ -1,6 +1,5 @@
 import os
 from comfy_extras.chainner_models import model_loading
-from comfy.sd import load_torch_file
 import model_management
 import torch
 import comfy.utils
@ -18,7 +17,7 @@ class UpscaleModelLoader:

    def load_model(self, model_name):
        model_path = folder_paths.get_full_path("upscale_models", model_name)
-        sd = load_torch_file(model_path)
+        sd = comfy.utils.load_torch_file(model_path)
        out = model_loading.load_state_dict(sd).eval()
        return (out, )

--- a/custom_nodes/example_node.py.example
+++ b/custom_nodes/example_node.py.example
@ -11,6 +11,8 @@ class Example:
    ----------
    RETURN_TYPES (`tuple`): 
        The type of each element in the output tulple.
+    RETURN_NAMES (`tuple`):
+        Optional: The name of each output in the output tuple.
    FUNCTION (`str`):
        The name of the entry-point method. For example, if `FUNCTION = "execute"` then it will run Example().execute()
    OUTPUT_NODE ([`bool`]):
@ -61,6 +63,8 @@ class Example:
        }

    RETURN_TYPES = ("IMAGE",)
+    #RETURN_NAMES = ("image_output_name",)
+
    FUNCTION = "test"

    #OUTPUT_NODE = False
--- a/main.py
+++ b/main.py
@ -11,9 +11,14 @@ if os.name == "nt":

 if __name__ == "__main__":
    if '--help' in sys.argv:
+        print()
        print("Valid Command line Arguments:")
        print("\t--listen [ip]\t\t\tListen on ip or 0.0.0.0 if none given so the UI can be accessed from other computers.")
        print("\t--port 8188\t\t\tSet the listen port.")
+        print()
+        print("\t--extra-model-paths-config file.yaml\tload an extra_model_paths.yaml file.")
+        print()
+        print()
        print("\t--dont-upcast-attention\t\tDisable upcasting of attention \n\t\t\t\t\tcan boost speed but increase the chances of black images.\n")
        print("\t--use-split-cross-attention\tUse the split cross attention optimization instead of the sub-quadratic one.\n\t\t\t\t\tIgnored when xformers is used.")
        print("\t--use-pytorch-cross-attention\tUse the new pytorch 2.0 cross attention function.")
@ -40,6 +45,7 @@ if __name__ == "__main__":
    except:
        pass

+from nodes import init_custom_nodes
 import execution
 import server
 import folder_paths
@ -98,6 +104,8 @@ if __name__ == "__main__":
    server = server.PromptServer(loop)
    q = execution.PromptQueue(server)

+    init_custom_nodes()
+    server.add_routes()
    hijack_progress(server)

    threading.Thread(target=prompt_worker, daemon=True, args=(q,server,)).start()
@ -113,7 +121,6 @@ if __name__ == "__main__":
    except:
        address = '127.0.0.1'

-
    dont_print = False
    if '--dont-print-server' in sys.argv:
        dont_print = True
--- a/nodes.py
+++ b/nodes.py
@ -18,7 +18,7 @@ import comfy.samplers
 import comfy.sd
 import comfy.utils

-import comfy_extras.clip_vision
+import comfy.clip_vision

 import model_management
 import importlib
@ -219,6 +219,21 @@ class CheckpointLoaderSimple:
        out = comfy.sd.load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, embedding_directory=folder_paths.get_folder_paths("embeddings"))
        return out

+class unCLIPCheckpointLoader:
+    """Node that loads a checkpoint and also requests its CLIP vision model.
+
+    RETURN_TYPES declares four outputs: MODEL, CLIP, VAE and CLIP_VISION.
+    """
+
+    @classmethod
+    def INPUT_TYPES(s):
+        """Declare the single required input: a file name from the "checkpoints" folder list."""
+        return {"required": { "ckpt_name": (folder_paths.get_filename_list("checkpoints"), ),
+                             }}
+    RETURN_TYPES = ("MODEL", "CLIP", "VAE", "CLIP_VISION")
+    FUNCTION = "load_checkpoint"
+
+    CATEGORY = "_for_testing/unclip"
+
+    def load_checkpoint(self, ckpt_name, output_vae=True, output_clip=True):
+        """Load the named checkpoint with CLIP vision output enabled.
+
+        ckpt_name is resolved to a full path through folder_paths. output_vae and
+        output_clip are forwarded to the loader (both default to True).
+
+        Returns whatever comfy.sd.load_checkpoint_guess_config returns, unchanged.
+        NOTE(review): assumed to be a 4-tuple matching RETURN_TYPES - confirm
+        against comfy.sd.
+        """
+        ckpt_path = folder_paths.get_full_path("checkpoints", ckpt_name)
+        # Forward the caller's flags instead of hard-coding True, so the
+        # parameters in the signature actually take effect.
+        out = comfy.sd.load_checkpoint_guess_config(ckpt_path, output_vae=output_vae, output_clip=output_clip, output_clipvision=True, embedding_directory=folder_paths.get_folder_paths("embeddings"))
+        return out
+
 class CLIPSetLastLayer:
    @classmethod
    def INPUT_TYPES(s):
@ -330,6 +345,22 @@ class LoraLoaderBlockWeights:
        model_lora, clip_lora = comfy.sd.load_lora_for_models(model, clip, lora_path, strength_model, strength_clip, block_weights)
        return (model_lora, clip_lora)

+class TomePatchModel:
+    """Node that clones a MODEL and calls set_model_tomesd(ratio) on the clone."""
+
+    RETURN_TYPES = ("MODEL",)
+    FUNCTION = "patch"
+    CATEGORY = "_for_testing"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        """Declare the required inputs: a MODEL and a FLOAT ratio in [0.0, 1.0] (default 0.3)."""
+        ratio_options = {"default": 0.3, "min": 0.0, "max": 1.0, "step": 0.01}
+        required = {
+            "model": ("MODEL",),
+            "ratio": ("FLOAT", ratio_options),
+        }
+        return {"required": required}
+
+    def patch(self, model, ratio):
+        """Return a 1-tuple holding a clone of model with the ratio applied.
+
+        The input model is not modified here; only the clone receives
+        set_model_tomesd(ratio).
+        """
+        patched = model.clone()
+        patched.set_model_tomesd(ratio)
+        return (patched, )
+
 class VAELoader:
    @classmethod
    def INPUT_TYPES(s):
@ -430,7 +461,7 @@ class CLIPVisionLoader:

    def load_clip(self, clip_name):
        clip_path = folder_paths.get_full_path("clip_vision", clip_name)
-        clip_vision = comfy_extras.clip_vision.load(clip_path)
+        clip_vision = comfy.clip_vision.load(clip_path)
        return (clip_vision,)

 class CLIPVisionEncode:
@ -442,7 +473,7 @@ class CLIPVisionEncode:
    RETURN_TYPES = ("CLIP_VISION_OUTPUT",)
    FUNCTION = "encode"

-    CATEGORY = "conditioning/style_model"
+    CATEGORY = "conditioning"

    def encode(self, clip_vision, image):
        output = clip_vision.encode_image(image)
@ -484,6 +515,32 @@ class StyleModelApply:
            c.append(n)
        return (c, )

+class unCLIPConditioning:
+    """Node that appends a (clip_vision_output, strength) pair to each conditioning entry's "adm" list."""
+
+    RETURN_TYPES = ("CONDITIONING",)
+    FUNCTION = "apply_adm"
+    CATEGORY = "_for_testing/unclip"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        """Declare the required inputs; strength is a FLOAT in [-10.0, 10.0], default 1.0."""
+        strength_options = {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01}
+        required = {
+            "conditioning": ("CONDITIONING", ),
+            "clip_vision_output": ("CLIP_VISION_OUTPUT", ),
+            "strength": ("FLOAT", strength_options),
+        }
+        return {"required": required}
+
+    def apply_adm(self, conditioning, clip_vision_output, strength):
+        """Return a 1-tuple with a new conditioning list carrying the extra "adm" entry.
+
+        Each input item is indexed as [value, options_dict]. The options dict and
+        any existing "adm" list are copied before being extended, so the caller's
+        objects are left untouched.
+        """
+        adm_entry = (clip_vision_output, strength)
+        result = []
+        for item in conditioning:
+            options = item[1].copy()
+            # Slice-copy the existing list so the original "adm" list is never mutated
+            previous = options["adm"][:] if "adm" in options else []
+            options["adm"] = previous + [adm_entry]
+            result.append([item[0], options])
+        return (result, )
+
+
 class EmptyLatentImage:
    def __init__(self, device="cpu"):
        self.device = device
@ -722,7 +779,7 @@ def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive,
    model_management.load_controlnet_gpu(control_net_models)

    if sampler_name in comfy.samplers.KSampler.SAMPLERS:
-        sampler = comfy.samplers.KSampler(real_model, steps=steps, device=device, sampler=sampler_name, scheduler=scheduler, denoise=denoise)
+        sampler = comfy.samplers.KSampler(real_model, steps=steps, device=device, sampler=sampler_name, scheduler=scheduler, denoise=denoise, model_options=model.model_options)
    else:
        #other samplers
        pass
@ -1086,6 +1143,7 @@ NODE_CLASS_MAPPINGS = {
    "CLIPLoader": CLIPLoader,
    "CLIPVisionEncode": CLIPVisionEncode,
    "StyleModelApply": StyleModelApply,
+    "unCLIPConditioning": unCLIPConditioning,
    "ControlNetApply": ControlNetApply,
    "ControlNetLoader": ControlNetLoader,
    "DiffControlNetLoader": DiffControlNetLoader,
@ -1093,6 +1151,8 @@ NODE_CLASS_MAPPINGS = {
    "CLIPVisionLoader": CLIPVisionLoader,
    "VAEDecodeTiled": VAEDecodeTiled,
    "VAEEncodeTiled": VAEEncodeTiled,
+    "TomePatchModel": TomePatchModel,
+    "unCLIPCheckpointLoader": unCLIPCheckpointLoader,
 }

 def load_custom_node(module_path):
@ -1127,6 +1187,6 @@ def load_custom_nodes():
        if os.path.isfile(module_path) and os.path.splitext(module_path)[1] != ".py": continue
        load_custom_node(module_path)

-load_custom_nodes()
-
-load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_upscale_model.py"))
+def init_custom_nodes():
+    """Load the custom nodes, then the bundled comfy_extras/nodes_upscale_model.py node file."""
+    load_custom_nodes()
+    # The extras directory sits next to this file
+    base_dir = os.path.dirname(os.path.realpath(__file__))
+    upscale_nodes_path = os.path.join(base_dir, "comfy_extras", "nodes_upscale_model.py")
+    load_custom_node(upscale_nodes_path)
--- a/server.py
+++ b/server.py
@ -42,6 +42,7 @@ class PromptServer():
        self.web_root = os.path.join(os.path.dirname(
            os.path.realpath(__file__)), "web")
        routes = web.RouteTableDef()
+        self.routes = routes
        self.last_node_id = None
        self.client_id = None

@ -239,8 +240,9 @@ class PromptServer():
                    self.prompt_queue.delete_history_item(id_to_delete)

            return web.Response(status=200)
-
-        self.app.add_routes(routes)
+        
+    def add_routes(self):
+        self.app.add_routes(self.routes)
        self.app.add_routes([
            web.static('/', self.web_root),
        ])
--- a/web/extensions/core/rerouteNode.js
+++ b/web/extensions/core/rerouteNode.js
@ -43,8 +43,15 @@ app.registerExtension({
 							const node = app.graph.getNodeById(link.origin_id);
 							const type = node.constructor.type;
 							if (type === "Reroute") {
+								if (node === this) {
+									// We've found a circle
+									currentNode.disconnectInput(link.target_slot);
+									currentNode = null;
+								}
+								else {
 								// Move the previous node
-								currentNode = node;
+									currentNode = node;
+								}
 							} else {
 								// We've found the end
 								inputNode = currentNode;
--- a/web/extensions/core/slotDefaults.js
+++ b/web/extensions/core/slotDefaults.js
@ -0,0 +1,21 @@
+import { app } from "/scripts/app.js";
+
+// Adds defaults for quickly adding nodes with middle click on the input/output
+
+app.registerExtension({
+	name: "Comfy.SlotDefaults",
+	init() {
+		// Default node type per slot type, used for input slots
+		const defaultNodesForInputs = {
+			MODEL: "CheckpointLoaderSimple",
+			LATENT: "EmptyLatentImage",
+			VAE: "VAELoader",
+		};
+
+		// Default node type per slot type, used for output slots
+		const defaultNodesForOutputs = {
+			LATENT: "VAEDecode",
+			IMAGE: "SaveImage",
+			CLIP: "CLIPTextEncode",
+		};
+
+		// Enable middle-click node creation and register both default tables on LiteGraph
+		Object.assign(LiteGraph, {
+			middle_click_slot_add_default_node: true,
+			slot_types_default_in: defaultNodesForInputs,
+			slot_types_default_out: defaultNodesForOutputs,
+		});
+	},
+});
--- a/web/extensions/core/snapToGrid.js
+++ b/web/extensions/core/snapToGrid.js
@ -0,0 +1,89 @@
+import { app } from "/scripts/app.js";
+
+// Shift + drag/resize to snap to grid
+
+app.registerExtension({
+	name: "Comfy.SnapToGrid",
+	init() {
+		// Round a coordinate or length to the nearest multiple of the current grid size
+		const snap = (value) => LiteGraph.CANVAS_GRID_SIZE * Math.round(value / LiteGraph.CANVAS_GRID_SIZE);
+
+		// Add setting to control grid size
+		app.ui.settings.addSetting({
+			id: "Comfy.SnapToGrid.GridSize",
+			name: "Grid Size",
+			type: "number",
+			attrs: {
+				min: 1,
+				max: 500,
+			},
+			tooltip:
+				"When dragging and resizing nodes while holding shift they will be aligned to the grid, this controls the size of that grid.",
+			defaultValue: LiteGraph.CANVAS_GRID_SIZE,
+			onChange(value) {
+				// An emptied or invalid field coerces to 0 or NaN. Snapping divides by the grid
+				// size, so keep the previous size instead of producing NaN positions and sizes.
+				const size = +value;
+				if (Number.isFinite(size) && size > 0) {
+					LiteGraph.CANVAS_GRID_SIZE = size;
+				}
+			},
+		});
+
+		// After moving a node, if the shift key is down align it to grid
+		const onNodeMoved = app.canvas.onNodeMoved;
+		app.canvas.onNodeMoved = function (node) {
+			const r = onNodeMoved?.apply(this, arguments);
+
+			if (app.shiftDown) {
+				// Ensure all selected nodes are realigned
+				for (const id in this.selected_nodes) {
+					this.selected_nodes[id].alignToGrid();
+				}
+			}
+
+			return r;
+		};
+
+		// When a node is added, wrap its resize handler so the size is aligned with the grid
+		const onNodeAdded = app.graph.onNodeAdded;
+		app.graph.onNodeAdded = function (node) {
+			const onResize = node.onResize;
+			node.onResize = function () {
+				if (app.shiftDown) {
+					node.size[0] = snap(node.size[0]);
+					node.size[1] = snap(node.size[1]);
+				}
+				// Chain to any resize handler the node already had
+				return onResize?.apply(this, arguments);
+			};
+			return onNodeAdded?.apply(this, arguments);
+		};
+
+		// Draw a preview of where the node will go if holding shift and the node is selected
+		const origDrawNode = LGraphCanvas.prototype.drawNode;
+		LGraphCanvas.prototype.drawNode = function (node, ctx) {
+			if (app.shiftDown && this.node_dragged && node.id in this.selected_nodes) {
+				// Offset from the node's current position to its snapped position
+				const shiftX = snap(node.pos[0]) - node.pos[0];
+				let shiftY = snap(node.pos[1]) - node.pos[1];
+
+				let w, h;
+				if (node.flags.collapsed) {
+					w = node._collapsed_width;
+					h = LiteGraph.NODE_TITLE_HEIGHT;
+					shiftY -= LiteGraph.NODE_TITLE_HEIGHT;
+				} else {
+					w = node.size[0];
+					h = node.size[1];
+					let titleMode = node.constructor.title_mode;
+					// Include the title bar in the preview unless the node has no visible title
+					if (titleMode !== LiteGraph.TRANSPARENT_TITLE && titleMode !== LiteGraph.NO_TITLE) {
+						h += LiteGraph.NODE_TITLE_HEIGHT;
+						shiftY -= LiteGraph.NODE_TITLE_HEIGHT;
+					}
+				}
+				// Paint the translucent preview, then restore the previous fill style
+				const f = ctx.fillStyle;
+				ctx.fillStyle = "rgba(100, 100, 100, 0.5)";
+				ctx.fillRect(shiftX, shiftY, w, h);
+				ctx.fillStyle = f;
+			}
+
+			return origDrawNode.apply(this, arguments);
+		};
+	},
+});
--- a/web/extensions/core/widgetInputs.js
+++ b/web/extensions/core/widgetInputs.js
@ -20,7 +20,7 @@ function hideWidget(node, widget, suffix = "") {
 		if (link == null) {
 			return undefined;
 		}
-		return widget.value;
+		return widget.origSerializeValue ? widget.origSerializeValue() : widget.value;
 	};

 	// Hide any linked widgets, e.g. seed+randomize
@ -101,7 +101,7 @@ app.registerExtension({
 							callback: () => convertToWidget(this, w),
 						});
 					} else {
-						const config = nodeData?.input?.required[w.name] || [w.type, w.options || {}];
+						const config = nodeData?.input?.required[w.name] || nodeData?.input?.optional?.[w.name] || [w.type, w.options || {}];
 						if (isConvertableWidget(w, config)) {
 							toInput.push({
 								content: `Convert ${w.name} to input`,
--- a/web/scripts/app.js
+++ b/web/scripts/app.js
@ -5,10 +5,20 @@ import { defaultGraph } from "./defaultGraph.js";
 import { getPngMetadata, importA1111 } from "./pnginfo.js";

 class ComfyApp {
+	/** 
+	 * List of {number, batchCount} entries to queue
+	 */
+	#queueItems = [];
+	/**
+	 * If the queue is currently being processed
+	 */
+	#processingQueue = false;
+
 	constructor() {
 		this.ui = new ComfyUI(this);
 		this.extensions = [];
 		this.nodeOutputs = {};
+		this.shiftDown = false;
 	}

 	/**
@ -628,11 +638,16 @@ class ComfyApp {

 	#addKeyboardHandler() {
 		window.addEventListener("keydown", (e) => {
+			this.shiftDown = e.shiftKey;
+
 			// Queue prompt using ctrl or command + enter
 			if ((e.ctrlKey || e.metaKey) && (e.key === "Enter" || e.keyCode === 13 || e.keyCode === 10)) {
 				this.queuePrompt(e.shiftKey ? -1 : 0);
 			}
 		});
+		window.addEventListener("keyup", (e) => {
+			this.shiftDown = e.shiftKey;
+		});
 	}

 	/**
@ -667,6 +682,9 @@ class ComfyApp {
 		const canvas = (this.canvas = new LGraphCanvas(canvasEl, this.graph));
 		this.ctx = canvasEl.getContext("2d");

+		LiteGraph.release_link_on_empty_shows_menu = true;
+		LiteGraph.alt_drag_do_clone_nodes = true;
+
 		this.graph.start();

 		function resizeCanvas() {
@ -802,7 +820,7 @@ class ComfyApp {
 		this.clean();

 		if (!graphData) {
-			graphData = defaultGraph;
+			graphData = structuredClone(defaultGraph);
 		}

 		// Patch T2IAdapterLoader to ControlNetLoader since they are the same node now
@ -915,31 +933,47 @@ class ComfyApp {
 	}

 	async queuePrompt(number, batchCount = 1) {
-		for (let i = 0; i < batchCount; i++) {
-			const p = await this.graphToPrompt();
+		this.#queueItems.push({ number, batchCount });

-			try {
-				await api.queuePrompt(number, p);
-			} catch (error) {
-				this.ui.dialog.show(error.response || error.toString());
-				return;
-			}
+		// Only have one action process the items so each one gets a unique seed correctly
+		if (this.#processingQueue) {
+			return;
+		}
+	
+		this.#processingQueue = true;
+		try {
+			while (this.#queueItems.length) {
+				({ number, batchCount } = this.#queueItems.pop());

-			for (const n of p.workflow.nodes) {
-				const node = graph.getNodeById(n.id);
-				if (node.widgets) {
-					for (const widget of node.widgets) {
-						// Allow widgets to run callbacks after a prompt has been queued
-						// e.g. random seed after every gen
-						if (widget.afterQueued) {
-							widget.afterQueued();
+				for (let i = 0; i < batchCount; i++) {
+					const p = await this.graphToPrompt();
+
+					try {
+						await api.queuePrompt(number, p);
+					} catch (error) {
+						this.ui.dialog.show(error.response || error.toString());
+						break;
+					}
+
+					for (const n of p.workflow.nodes) {
+						const node = graph.getNodeById(n.id);
+						if (node.widgets) {
+							for (const widget of node.widgets) {
+								// Allow widgets to run callbacks after a prompt has been queued
+								// e.g. random seed after every gen
+								if (widget.afterQueued) {
+									widget.afterQueued();
+								}
+							}
 						}
 					}
+
+					this.canvas.draw(true, true);
+					await this.ui.queue.update();
 				}
 			}
-
-			this.canvas.draw(true, true);
-			await this.ui.queue.update();
+		} finally {
+			this.#processingQueue = false;
 		}
 	}

--- a/web/scripts/ui.js
+++ b/web/scripts/ui.js
@ -35,21 +35,87 @@ export function $el(tag, propsOrChildren, children) {
 	return element;
 }

-function dragElement(dragEl) {
+function dragElement(dragEl, settings) {
 	var posDiffX = 0,
 		posDiffY = 0,
 		posStartX = 0,
 		posStartY = 0,
 		newPosX = 0,
 		newPosY = 0;
-	if (dragEl.getElementsByClassName('drag-handle')[0]) {
+	if (dragEl.getElementsByClassName("drag-handle")[0]) {
 		// if present, the handle is where you move the DIV from:
-		dragEl.getElementsByClassName('drag-handle')[0].onmousedown = dragMouseDown;
+		dragEl.getElementsByClassName("drag-handle")[0].onmousedown = dragMouseDown;
 	} else {
 		// otherwise, move the DIV from anywhere inside the DIV:
 		dragEl.onmousedown = dragMouseDown;
 	}

+	// When the element resizes (e.g. view queue) ensure it is still in the window's bounds.
+	// observe() returns undefined, so hold the observer itself and call observe() separately.
+	const resizeObserver = new ResizeObserver(() => {
+		ensureInBounds();
+	});
+	resizeObserver.observe(dragEl);
+
+	// Clamp a manually positioned element back inside the document body and re-apply its position
+	function ensureInBounds() {
+		// Elements without the manual-position class are left where they are
+		if (!dragEl.classList.contains("comfy-menu-manual-pos")) {
+			return;
+		}
+
+		const maxLeft = document.body.clientWidth - dragEl.clientWidth;
+		const maxTop = document.body.clientHeight - dragEl.clientHeight;
+		newPosX = Math.min(maxLeft, Math.max(0, dragEl.offsetLeft));
+		newPosY = Math.min(maxTop, Math.max(0, dragEl.offsetTop));
+
+		positionElement();
+	}
+
+	// Apply newPosX/newPosY to dragEl's inline style and, when savePos is truthy,
+	// store the resulting offsets in localStorage under "Comfy.MenuPosition"
+	function positionElement() {
+		const halfWidth = document.body.clientWidth / 2;
+		// True when the element's horizontal centre lies in the right half of the body
+		const anchorRight = newPosX + dragEl.clientWidth / 2 > halfWidth;
+
+		// set the element's new position:
+		if (anchorRight) {
+			// Express the position as a distance from the body's right edge
+			dragEl.style.left = "unset";
+			dragEl.style.right = document.body.clientWidth - newPosX - dragEl.clientWidth + "px";
+		} else {
+			dragEl.style.left = newPosX + "px";
+			dragEl.style.right = "unset";
+		}
+		
+		dragEl.style.top = newPosY + "px";
+		dragEl.style.bottom = "unset";
+
+		// Offsets are read after the styles above were written, so they describe the new placement
+		if (savePos) {
+			localStorage.setItem(
+				"Comfy.MenuPosition",
+				JSON.stringify({
+					x: dragEl.offsetLeft,
+					y: dragEl.offsetTop,
+				})
+			);
+		}
+	}
+
+	// Re-apply the position stored under "Comfy.MenuPosition", if one exists
+	function restorePos() {
+		const stored = localStorage.getItem("Comfy.MenuPosition");
+		if (!stored) {
+			return;
+		}
+
+		const { x, y } = JSON.parse(stored);
+		newPosX = x;
+		newPosY = y;
+		positionElement();
+		ensureInBounds();
+	}
+
+	// Stays undefined until the setting's onChange first runs; afterwards holds the value it was given
+	let savePos = undefined;
+	// Register the "Save menu position" checkbox; positionElement() reads savePos to decide whether to persist
+	settings.addSetting({
+		id: "Comfy.MenuPosition",
+		name: "Save menu position",
+		type: "boolean",
+		defaultValue: savePos,
+		onChange(value) {
+			// Restore the stored position only on the first notification, and only when saving is enabled
+			if (savePos === undefined && value) {
+				restorePos();
+			}
+			savePos = value;
+		},
+	});
+
 	function dragMouseDown(e) {
 		e = e || window.event;
 		e.preventDefault();
@ -64,18 +130,25 @@ function dragElement(dragEl) {
 	function elementDrag(e) {
 		e = e || window.event;
 		e.preventDefault();
+
+		dragEl.classList.add("comfy-menu-manual-pos");
+
 		// calculate the new cursor position:
 		posDiffX = e.clientX - posStartX;
 		posDiffY = e.clientY - posStartY;
 		posStartX = e.clientX;
 		posStartY = e.clientY;
-		newPosX = Math.min((document.body.clientWidth - dragEl.clientWidth), Math.max(0, (dragEl.offsetLeft + posDiffX)));
-		newPosY = Math.min((document.body.clientHeight - dragEl.clientHeight), Math.max(0, (dragEl.offsetTop + posDiffY)));
-		// set the element's new position:
-		dragEl.style.top = newPosY + "px";
-		dragEl.style.left = newPosX + "px";
+
+		newPosX = Math.min(document.body.clientWidth - dragEl.clientWidth, Math.max(0, dragEl.offsetLeft + posDiffX));
+		newPosY = Math.min(document.body.clientHeight - dragEl.clientHeight, Math.max(0, dragEl.offsetTop + posDiffY));
+
+		positionElement();
 	}

+	window.addEventListener("resize", () => {
+			ensureInBounds();
+	});
+
 	function closeDragElement() {
 		// stop moving when mouse button is released:
 		document.onmouseup = null;
@ -125,7 +198,7 @@ class ComfySettingsDialog extends ComfyDialog {
 		localStorage[settingId] = JSON.stringify(value);
 	}

-	addSetting({ id, name, type, defaultValue, onChange }) {
+	addSetting({ id, name, type, defaultValue, onChange, attrs = {}, tooltip = "", }) {
 		if (!id) {
 			throw new Error("Settings must have an ID");
 		}
@ -152,42 +225,72 @@ class ComfySettingsDialog extends ComfyDialog {
 					value = v;
 				};

+				let element;
+
 				if (typeof type === "function") {
-					return type(name, setter, value);
+					element = type(name, setter, value, attrs);
+				} else {
+					switch (type) {
+						case "boolean":
+							element = $el("div", [
+								$el("label", { textContent: name || id }, [
+									$el("input", {
+										type: "checkbox",
+										checked: !!value,
+										oninput: (e) => {
+											setter(e.target.checked);
+										},
+										...attrs
+									}),
+								]),
+							]);
+							break;
+						case "number":
+							element = $el("div", [
+								$el("label", { textContent: name || id }, [
+									$el("input", {
+										type,
+										value,
+										oninput: (e) => {
+											setter(e.target.value);
+										},
+										...attrs
+									}),
+								]),
+							]);
+							break;
+						default:
+							console.warn("Unsupported setting type, defaulting to text");
+							element = $el("div", [
+								$el("label", { textContent: name || id }, [
+									$el("input", {
+										value,
+										oninput: (e) => {
+											setter(e.target.value);
+										},
+										...attrs
+									}),
+								]),
+							]);
+							break;
+					}
+				}
+				if(tooltip) {
+					element.title = tooltip;
 				}

-				switch (type) {
-					case "boolean":
-						return $el("div", [
-							$el("label", { textContent: name || id }, [
-								$el("input", {
-									type: "checkbox",
-									checked: !!value,
-									oninput: (e) => {
-										setter(e.target.checked);
-									},
-								}),
-							]),
-						]);
-					default:
-						console.warn("Unsupported setting type, defaulting to text");
-						return $el("div", [
-							$el("label", { textContent: name || id }, [
-								$el("input", {
-									value,
-									oninput: (e) => {
-										setter(e.target.value);
-									},
-								}),
-							]),
-						]);
-				}
+				return element;
 			},
 		});
 	}

 	show() {
 		super.show();
+		Object.assign(this.textElement.style, {
+			display: "flex",
+			flexDirection: "column",
+			gap: "10px"
+		});
 		this.textElement.replaceChildren(...this.settings.map((s) => s.render()));
 	}
 }
@ -225,10 +328,10 @@ class ComfyList {
 							$el("button", {
 								textContent: "Load",
 								onclick: () => {
+									app.loadGraphData(item.prompt[3].extra_pnginfo.workflow);
 									if (item.outputs) {
 										app.nodeOutputs = item.outputs;
 									}
-									app.loadGraphData(item.prompt[3].extra_pnginfo.workflow);
 								},
 							}),
 							$el("button", {
@ -316,34 +419,52 @@ export class ComfyUI {
 				$el("span", { $: (q) => (this.queueSize = q) }),
 				$el("button.comfy-settings-btn", { textContent: "⚙️", onclick: () => this.settings.show() }),
 			]),
-			$el("button.comfy-queue-btn", { textContent: "Queue Prompt", onclick: () => app.queuePrompt(0, this.batchCount) }),
+			$el("button.comfy-queue-btn", {
+				textContent: "Queue Prompt",
+				onclick: () => app.queuePrompt(0, this.batchCount),
+			}),
 			$el("div", {}, [
-				$el("label", { innerHTML: "Extra options"}, [
-					$el("input", { type: "checkbox", 
-						onchange: (i) => { 
-							document.getElementById('extraOptions').style.display = i.srcElement.checked ? "block" : "none";
-							this.batchCount = i.srcElement.checked ? document.getElementById('batchCountInputRange').value : 1;
-							document.getElementById('autoQueueCheckbox').checked = false;
-						}
-					})
-				])
-			]),
-			$el("div", { id: "extraOptions", style: { width: "100%", display: "none" }}, [
-				$el("label", { innerHTML: "Batch count" }, [
-					$el("input", { id: "batchCountInputNumber", type: "number", value: this.batchCount, min: "1", style: { width: "35%", "margin-left": "0.4em" }, 
-						oninput: (i) => { 
-							this.batchCount = i.target.value;
-							document.getElementById('batchCountInputRange').value = this.batchCount;
-						}
+				$el("label", { innerHTML: "Extra options" }, [
+					$el("input", {
+						type: "checkbox",
+						onchange: (i) => {
+							document.getElementById("extraOptions").style.display = i.srcElement.checked ? "block" : "none";
+							this.batchCount = i.srcElement.checked ? document.getElementById("batchCountInputRange").value : 1;
+							document.getElementById("autoQueueCheckbox").checked = false;
+						},
 					}),
-					$el("input", { id: "batchCountInputRange", type: "range", min: "1", max: "100", value: this.batchCount, 
+				]),
+			]),
+			$el("div", { id: "extraOptions", style: { width: "100%", display: "none" } }, [
+				$el("label", { innerHTML: "Batch count" }, [
+					$el("input", {
+						id: "batchCountInputNumber",
+						type: "number",
+						value: this.batchCount,
+						min: "1",
+						style: { width: "35%", "margin-left": "0.4em" },
+						oninput: (i) => {
+							this.batchCount = i.target.value;
+							document.getElementById("batchCountInputRange").value = this.batchCount;
+						},
+					}),
+					$el("input", {
+						id: "batchCountInputRange",
+						type: "range",
+						min: "1",
+						max: "100",
+						value: this.batchCount,
 						oninput: (i) => {
 							this.batchCount = i.srcElement.value;
-							document.getElementById('batchCountInputNumber').value = i.srcElement.value;
-						}
+							document.getElementById("batchCountInputNumber").value = i.srcElement.value;
+						},
+					}),
+					$el("input", {
+						id: "autoQueueCheckbox",
+						type: "checkbox",
+						checked: false,
+						title: "automatically queue prompt when the queue size hits 0",
 					}),
-					$el("input", { id: "autoQueueCheckbox", type: "checkbox", checked: false, title: "automatically queue prompt when the queue size hits 0",
-					})
 				]),
 			]),
 			$el("div.comfy-menu-btns", [
@ -395,7 +516,7 @@ export class ComfyUI {
 			$el("button", { textContent: "Load Default", onclick: () => app.loadGraphData() }),
 		]);

-		dragElement(this.menuContainer);
+		dragElement(this.menuContainer, this.settings);

 		this.setStatus({ exec_info: { queue_remaining: "X" } });
 	}
@ -403,10 +524,14 @@ export class ComfyUI {
 	setStatus(status) {
 		this.queueSize.textContent = "Queue size: " + (status ? status.exec_info.queue_remaining : "ERR");
 		if (status) {
-			if (this.lastQueueSize != 0 && status.exec_info.queue_remaining == 0 && document.getElementById('autoQueueCheckbox').checked) {
+			if (
+				this.lastQueueSize != 0 &&
+				status.exec_info.queue_remaining == 0 &&
+				document.getElementById("autoQueueCheckbox").checked
+			) {
 				app.queuePrompt(0, this.batchCount);
 			}
-			this.lastQueueSize = status.exec_info.queue_remaining
+			this.lastQueueSize = status.exec_info.queue_remaining;
 		}
 	}
 }