diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml
index 421dd5ee4..444d6b254 100644
--- a/.github/workflows/test-build.yml
+++ b/.github/workflows/test-build.yml
@@ -20,9 +20,9 @@ jobs:
       matrix:
         python-version: ["3.8", "3.9", "3.10", "3.11"]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
diff --git a/.github/workflows/windows_release_dependencies.yml b/.github/workflows/windows_release_dependencies.yml
new file mode 100644
index 000000000..aafe8a214
--- /dev/null
+++ b/.github/workflows/windows_release_dependencies.yml
@@ -0,0 +1,67 @@
+name: "Windows Release dependencies"
+
+on:
+  workflow_dispatch:
+    inputs:
+      xformers:
+        description: 'xformers version'
+        required: false
+        type: string
+        default: ""
+      cu:
+        description: 'cuda version'
+        required: true
+        type: string
+        default: "121"
+
+      python_minor:
+        description: 'python minor version'
+        required: true
+        type: string
+        default: "11"
+
+      python_patch:
+        description: 'python patch version'
+        required: true
+        type: string
+        default: "6"
+#  push:
+#    branches:
+#      - master
+
+jobs:
+  build_dependencies:
+    runs-on: windows-latest
+    steps:
+        - uses: actions/checkout@v3
+        - uses: actions/setup-python@v4
+          with:
+            python-version: 3.${{ inputs.python_minor }}.${{ inputs.python_patch }}
+
+        - shell: bash
+          run: |
+            echo "@echo off
+            ..\python_embeded\python.exe .\update.py ..\ComfyUI\\
+            echo -
+            echo This will try to update pytorch and all python dependencies, if you get an error wait for pytorch/xformers to fix their stuff
+            echo You should not be running this anyways unless you really have to
+            echo -
+            echo If you just want to update normally, close this and run update_comfyui.bat instead.
+            echo -
+            pause
+            ..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2
+            pause" > update_comfyui_and_python_dependencies.bat
+
+            python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir
+            python -m pip install --no-cache-dir ./temp_wheel_dir/*
+            echo installed basic
+            ls -lah temp_wheel_dir
+            mv temp_wheel_dir cu${{ inputs.cu }}_python_deps
+            tar cf cu${{ inputs.cu }}_python_deps.tar cu${{ inputs.cu }}_python_deps
+
+        - uses: actions/cache/save@v3
+          with:
+            path: |
+              cu${{ inputs.cu }}_python_deps.tar
+              update_comfyui_and_python_dependencies.bat
+            key: ${{ runner.os }}-build-cu${{ inputs.cu }}-${{ inputs.python_minor }}
diff --git a/.github/workflows/windows_release_nightly_pytorch.yml b/.github/workflows/windows_release_nightly_pytorch.yml
index 319942e7c..b793f7fe2 100644
--- a/.github/workflows/windows_release_nightly_pytorch.yml
+++ b/.github/workflows/windows_release_nightly_pytorch.yml
@@ -20,12 +20,12 @@ jobs:
             persist-credentials: false
         - uses: actions/setup-python@v4
           with:
-            python-version: '3.11.3'
+            python-version: '3.11.6'
         - shell: bash
           run: |
             cd ..
             cp -r ComfyUI ComfyUI_copy
-            curl https://www.python.org/ftp/python/3.11.3/python-3.11.3-embed-amd64.zip -o python_embeded.zip
+            curl https://www.python.org/ftp/python/3.11.6/python-3.11.6-embed-amd64.zip -o python_embeded.zip
             unzip python_embeded.zip -d python_embeded
             cd python_embeded
             echo 'import site' >> ./python311._pth
diff --git a/.github/workflows/windows_release_package.yml b/.github/workflows/windows_release_package.yml
new file mode 100644
index 000000000..87d37c24d
--- /dev/null
+++ b/.github/workflows/windows_release_package.yml
@@ -0,0 +1,100 @@
+name: "Windows Release packaging"
+
+on:
+  workflow_dispatch:
+    inputs:
+      cu:
+        description: 'cuda version'
+        required: true
+        type: string
+        default: "121"
+
+      python_minor:
+        description: 'python minor version'
+        required: true
+        type: string
+        default: "11"
+
+      python_patch:
+        description: 'python patch version'
+        required: true
+        type: string
+        default: "6"
+#  push:
+#    branches:
+#      - master
+
+jobs:
+  package_comfyui:
+    permissions:
+        contents: "write"
+        packages: "write"
+        pull-requests: "read"
+    runs-on: windows-latest
+    steps:
+        - uses: actions/cache/restore@v3
+          id: cache
+          with:
+            path: |
+              cu${{ inputs.cu }}_python_deps.tar
+              update_comfyui_and_python_dependencies.bat
+            key: ${{ runner.os }}-build-cu${{ inputs.cu }}-${{ inputs.python_minor }}
+        - shell: bash
+          run: |
+            mv cu${{ inputs.cu }}_python_deps.tar ../
+            mv update_comfyui_and_python_dependencies.bat ../
+            cd ..
+            tar xf cu${{ inputs.cu }}_python_deps.tar
+            pwd
+            ls
+
+        - uses: actions/checkout@v3
+          with:
+            fetch-depth: 0
+            persist-credentials: false
+        - shell: bash
+          run: |
+            cd ..
+            cp -r ComfyUI ComfyUI_copy
+            curl https://www.python.org/ftp/python/3.${{ inputs.python_minor }}.${{ inputs.python_patch }}/python-3.${{ inputs.python_minor }}.${{ inputs.python_patch }}-embed-amd64.zip -o python_embeded.zip
+            unzip python_embeded.zip -d python_embeded
+            cd python_embeded
+            echo 'import site' >> ./python3${{ inputs.python_minor }}._pth
+            curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
+            ./python.exe get-pip.py
+            ./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/*
+            sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
+            cd ..
+
+            git clone https://github.com/comfyanonymous/taesd
+            cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/
+
+            mkdir ComfyUI_windows_portable
+            mv python_embeded ComfyUI_windows_portable
+            mv ComfyUI_copy ComfyUI_windows_portable/ComfyUI
+
+            cd ComfyUI_windows_portable
+
+            mkdir update
+            cp -r ComfyUI/.ci/update_windows/* ./update/
+            cp -r ComfyUI/.ci/windows_base_files/* ./
+            cp ../update_comfyui_and_python_dependencies.bat ./update/
+
+            cd ..
+
+            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
+            mv ComfyUI_windows_portable.7z ComfyUI/new_ComfyUI_windows_portable_nvidia_cu${{ inputs.cu }}_or_cpu.7z
+
+            cd ComfyUI_windows_portable
+            python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu
+
+            ls
+
+        - name: Upload binaries to release
+          uses: svenstaro/upload-release-action@v2
+          with:
+                repo_token: ${{ secrets.GITHUB_TOKEN }}
+                file: new_ComfyUI_windows_portable_nvidia_cu${{ inputs.cu }}_or_cpu.7z
+                tag: "latest"
+                overwrite: true
+
diff --git a/README.md b/README.md
index d83b4bdac..925caa732 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 ComfyUI
 =======
-A powerful and modular stable diffusion GUI and backend.
+The most powerful and modular stable diffusion GUI and backend.
 -----------
 ![ComfyUI Screenshot](comfyui_screenshot.png)
 
@@ -69,7 +69,7 @@ Ctrl can also be replaced with Cmd instead for macOS users
 
 There is a portable standalone build for Windows that should work for running on Nvidia GPUs or for running on your CPU only on the [releases page](https://github.com/comfyanonymous/ComfyUI/releases).
 
-### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/download/latest/ComfyUI_windows_portable_nvidia_cu118_or_cpu.7z)
+### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/download/latest/ComfyUI_windows_portable_nvidia_cu121_or_cpu.7z)
 
 Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints
 
@@ -92,16 +92,16 @@ Put your VAE in: models/vae
 ### AMD GPUs (Linux only)
 AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:
 
-```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.4.2```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.6```
 
-This is the command to install the nightly with ROCm 5.6 that supports the 7000 series and might have some performance improvements:
-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm5.6```
+This is the command to install the nightly with ROCm 5.7 that might have some performance improvements:
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm5.7```
 
 ### NVIDIA
 
-Nvidia users should install torch and xformers using this command:
+Nvidia users should install pytorch using this command:
 
-```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 xformers```
+```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121```
 
 #### Troubleshooting
 
diff --git a/comfy/controlnet.py b/comfy/controlnet.py
index af0df103e..ea219c7e5 100644
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@@ -354,7 +354,7 @@ def load_controlnet(ckpt_path, model=None):
 
     if controlnet_config is None:
         use_fp16 = comfy.model_management.should_use_fp16()
-        controlnet_config = comfy.model_detection.model_config_from_unet(controlnet_data, prefix, use_fp16).unet_config
+        controlnet_config = comfy.model_detection.model_config_from_unet(controlnet_data, prefix, use_fp16, True).unet_config
     controlnet_config.pop("out_channels")
     controlnet_config["hint_channels"] = controlnet_data["{}input_hint_block.0.weight".format(prefix)].shape[1]
     control_model = comfy.cldm.cldm.ControlNet(**controlnet_config)
diff --git a/comfy/extra_samplers/uni_pc.py b/comfy/extra_samplers/uni_pc.py
index 7eaf6ff62..7e88bb9fa 100644
--- a/comfy/extra_samplers/uni_pc.py
+++ b/comfy/extra_samplers/uni_pc.py
@@ -688,7 +688,7 @@ class UniPC:
                 x_t = x_t_ - expand_dims(alpha_t * B_h, dims) * (corr_res + rhos_c[-1] * D1_t)
         else:
             x_t_ = (
-                expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dimss) * x
+                expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
                 - expand_dims(sigma_t * h_phi_1, dims) * model_prev_0
             )
             if x_t is None:
diff --git a/comfy/ldm/models/diffusion/ddim.py b/comfy/ldm/models/diffusion/ddim.py
index befab0075..433d48e30 100644
--- a/comfy/ldm/models/diffusion/ddim.py
+++ b/comfy/ldm/models/diffusion/ddim.py
@@ -59,7 +59,7 @@ class DDIMSampler(object):
     @torch.no_grad()
     def sample_custom(self,
                       ddim_timesteps,
-                      conditioning,
+                      conditioning=None,
                       callback=None,
                       img_callback=None,
                       quantize_x0=False,
diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py
index 34484b288..fcae6b66a 100644
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -538,6 +538,8 @@ class BasicTransformerBlock(nn.Module):
         if "block" in transformer_options:
             block = transformer_options["block"]
             extra_options["block"] = block
+        if "cond_or_uncond" in transformer_options:
+            extra_options["cond_or_uncond"] = transformer_options["cond_or_uncond"]
         if "patches" in transformer_options:
             transformer_patches = transformer_options["patches"]
         else:
diff --git a/comfy/ldm/modules/diffusionmodules/openaimodel.py b/comfy/ldm/modules/diffusionmodules/openaimodel.py
index 3ce3c2e7b..b42637c82 100644
--- a/comfy/ldm/modules/diffusionmodules/openaimodel.py
+++ b/comfy/ldm/modules/diffusionmodules/openaimodel.py
@@ -608,6 +608,7 @@ class UNetModel(nn.Module):
         """
         transformer_options["original_shape"] = list(x.shape)
         transformer_options["current_index"] = 0
+        transformer_patches = transformer_options.get("patches", {})
 
         assert (y is not None) == (
             self.num_classes is not None
@@ -644,6 +645,11 @@ class UNetModel(nn.Module):
                 if ctrl is not None:
                     hsp += ctrl
 
+            if "output_block_patch" in transformer_patches:
+                patch = transformer_patches["output_block_patch"]
+                for p in patch:
+                    h, hsp = p(h, hsp, transformer_options)
+
             h = th.cat([h, hsp], dim=1)
             del hsp
             if len(hs) > 0:
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index 372d5a2df..787c78575 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -1,5 +1,5 @@
-
-from . import supported_models
+import comfy.supported_models
+import comfy.supported_models_base
 
 def count_blocks(state_dict_keys, prefix_string):
     count = 0
@@ -109,17 +109,20 @@ def detect_unet_config(state_dict, key_prefix, use_fp16):
     return unet_config
 
 def model_config_from_unet_config(unet_config):
-    for model_config in supported_models.models:
+    for model_config in comfy.supported_models.models:
         if model_config.matches(unet_config):
             return model_config(unet_config)
 
     print("no match", unet_config)
     return None
 
-def model_config_from_unet(state_dict, unet_key_prefix, use_fp16):
+def model_config_from_unet(state_dict, unet_key_prefix, use_fp16, use_base_if_no_match=False):
     unet_config = detect_unet_config(state_dict, unet_key_prefix, use_fp16)
-    return model_config_from_unet_config(unet_config)
-
+    model_config = model_config_from_unet_config(unet_config)
+    if model_config is None and use_base_if_no_match:
+        return comfy.supported_models_base.BASE(unet_config)
+    else:
+        return model_config
 
 def unet_config_from_diffusers_unet(state_dict, use_fp16):
     match = {}
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 1050c13a4..8b8963726 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -488,6 +488,8 @@ def cast_to_device(tensor, device, dtype, copy=False):
     elif tensor.dtype == torch.bfloat16:
         if hasattr(device, 'type') and device.type.startswith("cuda"):
             device_supports_cast = True
+        elif is_intel_xpu():
+            device_supports_cast = True
 
     if device_supports_cast:
         if copy:
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index 10551656e..ba505221e 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -88,6 +88,9 @@ class ModelPatcher:
     def set_model_attn2_output_patch(self, patch):
         self.set_model_patch(patch, "attn2_output_patch")
 
+    def set_model_output_block_patch(self, patch):
+        self.set_model_patch(patch, "output_block_patch")
+
     def model_patches_to(self, device):
         to = self.model_options["transformer_options"]
         if "patches" in to:
diff --git a/comfy/sample.py b/comfy/sample.py
index e4730b189..322272766 100644
--- a/comfy/sample.py
+++ b/comfy/sample.py
@@ -70,28 +70,44 @@ def cleanup_additional_models(models):
         if hasattr(m, 'cleanup'):
             m.cleanup()
 
-def sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False, noise_mask=None, sigmas=None, callback=None, disable_pbar=False, seed=None):
-    device = comfy.model_management.get_torch_device()
+def prepare_sampling(model, noise_shape, positive, negative, noise_mask):
+    device = model.load_device
 
     if noise_mask is not None:
-        noise_mask = prepare_mask(noise_mask, noise.shape, device)
+        noise_mask = prepare_mask(noise_mask, noise_shape, device)
 
     real_model = None
     models, inference_memory = get_additional_models(positive, negative, model.model_dtype())
-    comfy.model_management.load_models_gpu([model] + models, comfy.model_management.batch_area_memory(noise.shape[0] * noise.shape[2] * noise.shape[3]) + inference_memory)
+    comfy.model_management.load_models_gpu([model] + models, comfy.model_management.batch_area_memory(noise_shape[0] * noise_shape[2] * noise_shape[3]) + inference_memory)
     real_model = model.model
 
-    noise = noise.to(device)
-    latent_image = latent_image.to(device)
-
-    positive_copy = broadcast_cond(positive, noise.shape[0], device)
-    negative_copy = broadcast_cond(negative, noise.shape[0], device)
+    positive_copy = broadcast_cond(positive, noise_shape[0], device)
+    negative_copy = broadcast_cond(negative, noise_shape[0], device)
+    return real_model, positive_copy, negative_copy, noise_mask, models
 
 
-    sampler = comfy.samplers.KSampler(real_model, steps=steps, device=device, sampler=sampler_name, scheduler=scheduler, denoise=denoise, model_options=model.model_options)
+def sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False, noise_mask=None, sigmas=None, callback=None, disable_pbar=False, seed=None):
+    real_model, positive_copy, negative_copy, noise_mask, models = prepare_sampling(model, noise.shape, positive, negative, noise_mask)
+
+    noise = noise.to(model.load_device)
+    latent_image = latent_image.to(model.load_device)
+
+    sampler = comfy.samplers.KSampler(real_model, steps=steps, device=model.load_device, sampler=sampler_name, scheduler=scheduler, denoise=denoise, model_options=model.model_options)
 
     samples = sampler.sample(noise, positive_copy, negative_copy, cfg=cfg, latent_image=latent_image, start_step=start_step, last_step=last_step, force_full_denoise=force_full_denoise, denoise_mask=noise_mask, sigmas=sigmas, callback=callback, disable_pbar=disable_pbar, seed=seed)
     samples = samples.cpu()
 
     cleanup_additional_models(models)
     return samples
+
+def sample_custom(model, noise, cfg, sampler, sigmas, positive, negative, latent_image, noise_mask=None, callback=None, disable_pbar=False, seed=None):
+    real_model, positive_copy, negative_copy, noise_mask, models = prepare_sampling(model, noise.shape, positive, negative, noise_mask)
+    noise = noise.to(model.load_device)
+    latent_image = latent_image.to(model.load_device)
+    sigmas = sigmas.to(model.load_device)
+
+    samples = comfy.samplers.sample(real_model, noise, positive_copy, negative_copy, cfg, model.load_device, sampler, sigmas, model_options=model.model_options, latent_image=latent_image, denoise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed)
+    samples = samples.cpu()
+    cleanup_additional_models(models)
+    return samples
+
diff --git a/comfy/samplers.py b/comfy/samplers.py
index e3192ca58..e43f7a6fe 100644
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@@ -544,21 +544,190 @@ def encode_adm(model, conds, batch_size, width, height, device, prompt_type):
     return conds
 
 
+class Sampler:
+    def sample(self):
+        pass
+
+    def max_denoise(self, model_wrap, sigmas):
+        return math.isclose(float(model_wrap.sigma_max), float(sigmas[0]), rel_tol=1e-05)
+
+class DDIM(Sampler):
+    def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False):
+        timesteps = []
+        for s in range(sigmas.shape[0]):
+            timesteps.insert(0, model_wrap.sigma_to_discrete_timestep(sigmas[s]))
+        noise_mask = None
+        if denoise_mask is not None:
+            noise_mask = 1.0 - denoise_mask
+
+        ddim_callback = None
+        if callback is not None:
+            total_steps = len(timesteps) - 1
+            ddim_callback = lambda pred_x0, i: callback(i, pred_x0, None, total_steps)
+
+        max_denoise = self.max_denoise(model_wrap, sigmas)
+
+        ddim_sampler = DDIMSampler(model_wrap.inner_model.inner_model, device=noise.device)
+        ddim_sampler.make_schedule_timesteps(ddim_timesteps=timesteps, verbose=False)
+        z_enc = ddim_sampler.stochastic_encode(latent_image, torch.tensor([len(timesteps) - 1] * noise.shape[0]).to(noise.device), noise=noise, max_denoise=max_denoise)
+        samples, _ = ddim_sampler.sample_custom(ddim_timesteps=timesteps,
+                                                batch_size=noise.shape[0],
+                                                shape=noise.shape[1:],
+                                                verbose=False,
+                                                eta=0.0,
+                                                x_T=z_enc,
+                                                x0=latent_image,
+                                                img_callback=ddim_callback,
+                                                denoise_function=model_wrap.predict_eps_discrete_timestep,
+                                                extra_args=extra_args,
+                                                mask=noise_mask,
+                                                to_zero=sigmas[-1]==0,
+                                                end_step=sigmas.shape[0] - 1,
+                                                disable_pbar=disable_pbar)
+        return samples
+
+class UNIPC(Sampler):
+    def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False):
+        return uni_pc.sample_unipc(model_wrap, noise, latent_image, sigmas, sampling_function=sampling_function, max_denoise=self.max_denoise(model_wrap, sigmas), extra_args=extra_args, noise_mask=denoise_mask, callback=callback, disable=disable_pbar)
+
+class UNIPCBH2(Sampler):
+    def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False):
+        return uni_pc.sample_unipc(model_wrap, noise, latent_image, sigmas, sampling_function=sampling_function, max_denoise=self.max_denoise(model_wrap, sigmas), extra_args=extra_args, noise_mask=denoise_mask, callback=callback, variant='bh2', disable=disable_pbar)
+
+KSAMPLER_NAMES = ["euler", "euler_ancestral", "heun", "dpm_2", "dpm_2_ancestral",
+                  "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_sde", "dpmpp_sde_gpu",
+                  "dpmpp_2m", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm"]
+
+def ksampler(sampler_name, extra_options={}):
+    class KSAMPLER(Sampler):
+        def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False):
+            extra_args["denoise_mask"] = denoise_mask
+            model_k = KSamplerX0Inpaint(model_wrap)
+            model_k.latent_image = latent_image
+            model_k.noise = noise
+
+            if self.max_denoise(model_wrap, sigmas):
+                noise = noise * torch.sqrt(1.0 + sigmas[0] ** 2.0)
+            else:
+                noise = noise * sigmas[0]
+
+            k_callback = None
+            total_steps = len(sigmas) - 1
+            if callback is not None:
+                k_callback = lambda x: callback(x["i"], x["denoised"], x["x"], total_steps)
+
+            sigma_min = sigmas[-1]
+            if sigma_min == 0:
+                sigma_min = sigmas[-2]
+
+            if latent_image is not None:
+                noise += latent_image
+            if sampler_name == "dpm_fast":
+                samples = k_diffusion_sampling.sample_dpm_fast(model_k, noise, sigma_min, sigmas[0], total_steps, extra_args=extra_args, callback=k_callback, disable=disable_pbar)
+            elif sampler_name == "dpm_adaptive":
+                samples = k_diffusion_sampling.sample_dpm_adaptive(model_k, noise, sigma_min, sigmas[0], extra_args=extra_args, callback=k_callback, disable=disable_pbar)
+            else:
+                samples = getattr(k_diffusion_sampling, "sample_{}".format(sampler_name))(model_k, noise, sigmas, extra_args=extra_args, callback=k_callback, disable=disable_pbar, **extra_options)
+            return samples
+    return KSAMPLER
+
+def wrap_model(model):
+    model_denoise = CFGNoisePredictor(model)
+    if model.model_type == model_base.ModelType.V_PREDICTION:
+        model_wrap = CompVisVDenoiser(model_denoise, quantize=True)
+    else:
+        model_wrap = k_diffusion_external.CompVisDenoiser(model_denoise, quantize=True)
+    return model_wrap
+
+def sample(model, noise, positive, negative, cfg, device, sampler, sigmas, model_options={}, latent_image=None, denoise_mask=None, callback=None, disable_pbar=False, seed=None):
+    positive = positive[:]
+    negative = negative[:]
+
+    resolve_areas_and_cond_masks(positive, noise.shape[2], noise.shape[3], device)
+    resolve_areas_and_cond_masks(negative, noise.shape[2], noise.shape[3], device)
+
+    model_wrap = wrap_model(model)
+
+    calculate_start_end_timesteps(model_wrap, negative)
+    calculate_start_end_timesteps(model_wrap, positive)
+
+    #make sure each cond area has an opposite one with the same area
+    for c in positive:
+        create_cond_with_same_area_if_none(negative, c)
+    for c in negative:
+        create_cond_with_same_area_if_none(positive, c)
+
+    pre_run_control(model_wrap, negative + positive)
+
+    apply_empty_x_to_equal_area(list(filter(lambda c: c[1].get('control_apply_to_uncond', False) == True, positive)), negative, 'control', lambda cond_cnets, x: cond_cnets[x])
+    apply_empty_x_to_equal_area(positive, negative, 'gligen', lambda cond_cnets, x: cond_cnets[x])
+
+    if model.is_adm():
+        positive = encode_adm(model, positive, noise.shape[0], noise.shape[3], noise.shape[2], device, "positive")
+        negative = encode_adm(model, negative, noise.shape[0], noise.shape[3], noise.shape[2], device, "negative")
+
+    if latent_image is not None:
+        latent_image = model.process_latent_in(latent_image)
+
+    extra_args = {"cond":positive, "uncond":negative, "cond_scale": cfg, "model_options": model_options, "seed":seed}
+
+    cond_concat = None
+    if hasattr(model, 'concat_keys'): #inpaint
+        cond_concat = []
+        for ck in model.concat_keys:
+            if denoise_mask is not None:
+                if ck == "mask":
+                    cond_concat.append(denoise_mask[:,:1])
+                elif ck == "masked_image":
+                    cond_concat.append(latent_image) #NOTE: the latent_image should be masked by the mask in pixel space
+            else:
+                if ck == "mask":
+                    cond_concat.append(torch.ones_like(noise)[:,:1])
+                elif ck == "masked_image":
+                    cond_concat.append(blank_inpaint_image_like(noise))
+        extra_args["cond_concat"] = cond_concat
+
+    samples = sampler.sample(model_wrap, sigmas, extra_args, callback, noise, latent_image, denoise_mask, disable_pbar)
+    return model.process_latent_out(samples.to(torch.float32))
+
+SCHEDULER_NAMES = ["normal", "karras", "exponential", "sgm_uniform", "simple", "ddim_uniform"]
+SAMPLER_NAMES = KSAMPLER_NAMES + ["ddim", "uni_pc", "uni_pc_bh2"]
+
+def calculate_sigmas_scheduler(model, scheduler_name, steps):
+    """Return the sigma schedule that `scheduler_name` produces for `model` over `steps` steps; raises ValueError when the name matches no branch below."""
+    model_wrap = wrap_model(model)
+    if scheduler_name == "karras":
+        return k_diffusion_sampling.get_sigmas_karras(n=steps, sigma_min=float(model_wrap.sigma_min), sigma_max=float(model_wrap.sigma_max))
+    if scheduler_name == "exponential":
+        return k_diffusion_sampling.get_sigmas_exponential(n=steps, sigma_min=float(model_wrap.sigma_min), sigma_max=float(model_wrap.sigma_max))
+    if scheduler_name == "normal":
+        return model_wrap.get_sigmas(steps)
+    if scheduler_name == "simple":
+        return simple_scheduler(model_wrap, steps)
+    if scheduler_name == "ddim_uniform":
+        return ddim_scheduler(model_wrap, steps)
+    if scheduler_name == "sgm_uniform":
+        return sgm_scheduler(model_wrap, steps)
+    # Module-level function: there is no `self` here, so report the argument that was actually passed in.
+    raise ValueError("error invalid scheduler {}".format(scheduler_name))
+
+def sampler_class(name):
+    if name == "uni_pc":
+        sampler = UNIPC
+    elif name == "uni_pc_bh2":
+        sampler = UNIPCBH2
+    elif name == "ddim":
+        sampler = DDIM
+    else:
+        sampler = ksampler(name)
+    return sampler
+
 class KSampler:
-    SCHEDULERS = ["normal", "karras", "exponential", "sgm_uniform", "simple", "ddim_uniform"]
-    SAMPLERS = ["euler", "euler_ancestral", "heun", "dpm_2", "dpm_2_ancestral",
-                "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_sde", "dpmpp_sde_gpu",
-                "dpmpp_2m", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "ddim", "uni_pc", "uni_pc_bh2"]
+    SCHEDULERS = SCHEDULER_NAMES
+    SAMPLERS = SAMPLER_NAMES
 
     def __init__(self, model, steps, device, sampler=None, scheduler=None, denoise=None, model_options={}):
         self.model = model
-        self.model_denoise = CFGNoisePredictor(self.model)
-        if self.model.model_type == model_base.ModelType.V_PREDICTION:
-            self.model_wrap = CompVisVDenoiser(self.model_denoise, quantize=True)
-        else:
-            self.model_wrap = k_diffusion_external.CompVisDenoiser(self.model_denoise, quantize=True)
-
-        self.model_k = KSamplerX0Inpaint(self.model_wrap)
         self.device = device
         if scheduler not in self.SCHEDULERS:
             scheduler = self.SCHEDULERS[0]
@@ -566,8 +735,6 @@ class KSampler:
             sampler = self.SAMPLERS[0]
         self.scheduler = scheduler
         self.sampler = sampler
-        self.sigma_min=float(self.model_wrap.sigma_min)
-        self.sigma_max=float(self.model_wrap.sigma_max)
         self.set_steps(steps, denoise)
         self.denoise = denoise
         self.model_options = model_options
@@ -580,20 +747,7 @@ class KSampler:
             steps += 1
             discard_penultimate_sigma = True
 
-        if self.scheduler == "karras":
-            sigmas = k_diffusion_sampling.get_sigmas_karras(n=steps, sigma_min=self.sigma_min, sigma_max=self.sigma_max)
-        elif self.scheduler == "exponential":
-            sigmas = k_diffusion_sampling.get_sigmas_exponential(n=steps, sigma_min=self.sigma_min, sigma_max=self.sigma_max)
-        elif self.scheduler == "normal":
-            sigmas = self.model_wrap.get_sigmas(steps)
-        elif self.scheduler == "simple":
-            sigmas = simple_scheduler(self.model_wrap, steps)
-        elif self.scheduler == "ddim_uniform":
-            sigmas = ddim_scheduler(self.model_wrap, steps)
-        elif self.scheduler == "sgm_uniform":
-            sigmas = sgm_scheduler(self.model_wrap, steps)
-        else:
-            print("error invalid scheduler", self.scheduler)
+        sigmas = calculate_sigmas_scheduler(self.model, self.scheduler, steps)
 
         if discard_penultimate_sigma:
             sigmas = torch.cat([sigmas[:-2], sigmas[-1:]])
@@ -611,10 +765,8 @@ class KSampler:
     def sample(self, noise, positive, negative, cfg, latent_image=None, start_step=None, last_step=None, force_full_denoise=False, denoise_mask=None, sigmas=None, callback=None, disable_pbar=False, seed=None):
         if sigmas is None:
             sigmas = self.sigmas
-        sigma_min = self.sigma_min
 
         if last_step is not None and last_step < (len(sigmas) - 1):
-            sigma_min = sigmas[last_step]
             sigmas = sigmas[:last_step + 1]
             if force_full_denoise:
                 sigmas[-1] = 0
@@ -628,117 +780,6 @@ class KSampler:
                 else:
                     return torch.zeros_like(noise)
 
-        positive = positive[:]
-        negative = negative[:]
+        sampler = sampler_class(self.sampler)
 
-        resolve_areas_and_cond_masks(positive, noise.shape[2], noise.shape[3], self.device)
-        resolve_areas_and_cond_masks(negative, noise.shape[2], noise.shape[3], self.device)
-
-        calculate_start_end_timesteps(self.model_wrap, negative)
-        calculate_start_end_timesteps(self.model_wrap, positive)
-
-        #make sure each cond area has an opposite one with the same area
-        for c in positive:
-            create_cond_with_same_area_if_none(negative, c)
-        for c in negative:
-            create_cond_with_same_area_if_none(positive, c)
-
-        pre_run_control(self.model_wrap, negative + positive)
-
-        apply_empty_x_to_equal_area(list(filter(lambda c: c[1].get('control_apply_to_uncond', False) == True, positive)), negative, 'control', lambda cond_cnets, x: cond_cnets[x])
-        apply_empty_x_to_equal_area(positive, negative, 'gligen', lambda cond_cnets, x: cond_cnets[x])
-
-        if self.model.is_adm():
-            positive = encode_adm(self.model, positive, noise.shape[0], noise.shape[3], noise.shape[2], self.device, "positive")
-            negative = encode_adm(self.model, negative, noise.shape[0], noise.shape[3], noise.shape[2], self.device, "negative")
-
-        if latent_image is not None:
-            latent_image = self.model.process_latent_in(latent_image)
-
-        extra_args = {"cond":positive, "uncond":negative, "cond_scale": cfg, "model_options": self.model_options, "seed":seed}
-
-        cond_concat = None
-        if hasattr(self.model, 'concat_keys'): #inpaint
-            cond_concat = []
-            for ck in self.model.concat_keys:
-                if denoise_mask is not None:
-                    if ck == "mask":
-                        cond_concat.append(denoise_mask[:,:1])
-                    elif ck == "masked_image":
-                        cond_concat.append(latent_image) #NOTE: the latent_image should be masked by the mask in pixel space
-                else:
-                    if ck == "mask":
-                        cond_concat.append(torch.ones_like(noise)[:,:1])
-                    elif ck == "masked_image":
-                        cond_concat.append(blank_inpaint_image_like(noise))
-            extra_args["cond_concat"] = cond_concat
-
-        if sigmas[0] != self.sigmas[0] or (self.denoise is not None and self.denoise < 1.0):
-            max_denoise = False
-        else:
-            max_denoise = True
-
-
-        if self.sampler == "uni_pc":
-            samples = uni_pc.sample_unipc(self.model_wrap, noise, latent_image, sigmas, sampling_function=sampling_function, max_denoise=max_denoise, extra_args=extra_args, noise_mask=denoise_mask, callback=callback, disable=disable_pbar)
-        elif self.sampler == "uni_pc_bh2":
-            samples = uni_pc.sample_unipc(self.model_wrap, noise, latent_image, sigmas, sampling_function=sampling_function, max_denoise=max_denoise, extra_args=extra_args, noise_mask=denoise_mask, callback=callback, variant='bh2', disable=disable_pbar)
-        elif self.sampler == "ddim":
-            timesteps = []
-            for s in range(sigmas.shape[0]):
-                timesteps.insert(0, self.model_wrap.sigma_to_discrete_timestep(sigmas[s]))
-            noise_mask = None
-            if denoise_mask is not None:
-                noise_mask = 1.0 - denoise_mask
-
-            ddim_callback = None
-            if callback is not None:
-                total_steps = len(timesteps) - 1
-                ddim_callback = lambda pred_x0, i: callback(i, pred_x0, None, total_steps)
-
-            sampler = DDIMSampler(self.model, device=self.device)
-            sampler.make_schedule_timesteps(ddim_timesteps=timesteps, verbose=False)
-            z_enc = sampler.stochastic_encode(latent_image, torch.tensor([len(timesteps) - 1] * noise.shape[0]).to(self.device), noise=noise, max_denoise=max_denoise)
-            samples, _ = sampler.sample_custom(ddim_timesteps=timesteps,
-                                                    conditioning=positive,
-                                                    batch_size=noise.shape[0],
-                                                    shape=noise.shape[1:],
-                                                    verbose=False,
-                                                    unconditional_guidance_scale=cfg,
-                                                    unconditional_conditioning=negative,
-                                                    eta=0.0,
-                                                    x_T=z_enc,
-                                                    x0=latent_image,
-                                                    img_callback=ddim_callback,
-                                                    denoise_function=self.model_wrap.predict_eps_discrete_timestep,
-                                                    extra_args=extra_args,
-                                                    mask=noise_mask,
-                                                    to_zero=sigmas[-1]==0,
-                                                    end_step=sigmas.shape[0] - 1,
-                                                    disable_pbar=disable_pbar)
-
-        else:
-            extra_args["denoise_mask"] = denoise_mask
-            self.model_k.latent_image = latent_image
-            self.model_k.noise = noise
-
-            if max_denoise:
-                noise = noise * torch.sqrt(1.0 + sigmas[0] ** 2.0)
-            else:
-                noise = noise * sigmas[0]
-
-            k_callback = None
-            total_steps = len(sigmas) - 1
-            if callback is not None:
-                k_callback = lambda x: callback(x["i"], x["denoised"], x["x"], total_steps)
-
-            if latent_image is not None:
-                noise += latent_image
-            if self.sampler == "dpm_fast":
-                samples = k_diffusion_sampling.sample_dpm_fast(self.model_k, noise, sigma_min, sigmas[0], total_steps, extra_args=extra_args, callback=k_callback, disable=disable_pbar)
-            elif self.sampler == "dpm_adaptive":
-                samples = k_diffusion_sampling.sample_dpm_adaptive(self.model_k, noise, sigma_min, sigmas[0], extra_args=extra_args, callback=k_callback, disable=disable_pbar)
-            else:
-                samples = getattr(k_diffusion_sampling, "sample_{}".format(self.sampler))(self.model_k, noise, sigmas, extra_args=extra_args, callback=k_callback, disable=disable_pbar)
-
-        return self.model.process_latent_out(samples.to(torch.float32))
+        return sample(self.model, noise, positive, negative, cfg, self.device, sampler(), sigmas, self.model_options, latent_image=latent_image, denoise_mask=denoise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed)
diff --git a/comfy/sd.py b/comfy/sd.py
index 9bdb2ad64..cfd6fb3cb 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -152,7 +152,9 @@ class VAE:
             sd = comfy.utils.load_torch_file(ckpt_path)
             if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys(): #diffusers format
                 sd = diffusers_convert.convert_vae_state_dict(sd)
-            self.first_stage_model.load_state_dict(sd, strict=False)
+            m, u = self.first_stage_model.load_state_dict(sd, strict=False)
+            if len(m) > 0:
+                print("Missing VAE keys", m)
 
         if device is None:
             device = model_management.vae_device()
@@ -181,7 +183,7 @@ class VAE:
         steps += pixel_samples.shape[0] * comfy.utils.get_tiled_scale_steps(pixel_samples.shape[3], pixel_samples.shape[2], tile_x * 2, tile_y // 2, overlap)
         pbar = comfy.utils.ProgressBar(steps)
 
-        encode_fn = lambda a: self.first_stage_model.encode(2. * a.to(self.vae_dtype).to(self.device) - 1.).sample().float()
+        encode_fn = lambda a: self.first_stage_model.encode((2. * a - 1.).to(self.vae_dtype).to(self.device)).sample().float()
         samples = comfy.utils.tiled_scale(pixel_samples, encode_fn, tile_x, tile_y, overlap, upscale_amount = (1/8), out_channels=4, pbar=pbar)
         samples += comfy.utils.tiled_scale(pixel_samples, encode_fn, tile_x * 2, tile_y // 2, overlap, upscale_amount = (1/8), out_channels=4, pbar=pbar)
         samples += comfy.utils.tiled_scale(pixel_samples, encode_fn, tile_x // 2, tile_y * 2, overlap, upscale_amount = (1/8), out_channels=4, pbar=pbar)
@@ -200,7 +202,7 @@ class VAE:
             pixel_samples = torch.empty((samples_in.shape[0], 3, round(samples_in.shape[2] * 8), round(samples_in.shape[3] * 8)), device="cpu")
             for x in range(0, samples_in.shape[0], batch_number):
                 samples = samples_in[x:x+batch_number].to(self.vae_dtype).to(self.device)
-                pixel_samples[x:x+batch_number] = torch.clamp((self.first_stage_model.decode(samples) + 1.0) / 2.0, min=0.0, max=1.0).cpu().float()
+                pixel_samples[x:x+batch_number] = torch.clamp((self.first_stage_model.decode(samples).cpu().float() + 1.0) / 2.0, min=0.0, max=1.0)
         except model_management.OOM_EXCEPTION as e:
             print("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
             pixel_samples = self.decode_tiled_(samples_in)
@@ -392,13 +394,14 @@ def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_cl
 
     return (comfy.model_patcher.ModelPatcher(model, load_device=model_management.get_torch_device(), offload_device=offload_device), clip, vae)
 
-def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None):
+def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True):
     sd = comfy.utils.load_torch_file(ckpt_path)
     sd_keys = sd.keys()
     clip = None
     clipvision = None
     vae = None
     model = None
+    model_patcher = None
     clip_target = None
 
     parameters = comfy.utils.calculate_parameters(sd, "model.diffusion_model.")
@@ -419,10 +422,11 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
     if fp16:
         dtype = torch.float16
 
-    inital_load_device = model_management.unet_inital_load_device(parameters, dtype)
-    offload_device = model_management.unet_offload_device()
-    model = model_config.get_model(sd, "model.diffusion_model.", device=inital_load_device)
-    model.load_model_weights(sd, "model.diffusion_model.")
+    if output_model:
+        inital_load_device = model_management.unet_inital_load_device(parameters, dtype)
+        offload_device = model_management.unet_offload_device()
+        model = model_config.get_model(sd, "model.diffusion_model.", device=inital_load_device)
+        model.load_model_weights(sd, "model.diffusion_model.")
 
     if output_vae:
         vae = VAE()
@@ -442,10 +446,11 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
     if len(left_over) > 0:
         print("left over keys:", left_over)
 
-    model_patcher = comfy.model_patcher.ModelPatcher(model, load_device=model_management.get_torch_device(), offload_device=model_management.unet_offload_device(), current_device=inital_load_device)
-    if inital_load_device != torch.device("cpu"):
-        print("loaded straight to GPU")
-        model_management.load_model_gpu(model_patcher)
+    if output_model:
+        model_patcher = comfy.model_patcher.ModelPatcher(model, load_device=model_management.get_torch_device(), offload_device=model_management.unet_offload_device(), current_device=inital_load_device)
+        if inital_load_device != torch.device("cpu"):
+            print("loaded straight to GPU")
+            model_management.load_model_gpu(model_patcher)
 
     return (model_patcher, clip, vae, clipvision)
 
diff --git a/comfy_extras/nodes_compositing.py b/comfy_extras/nodes_compositing.py
new file mode 100644
index 000000000..181b36ed6
--- /dev/null
+++ b/comfy_extras/nodes_compositing.py
@@ -0,0 +1,202 @@
+import numpy as np
+import torch
+import comfy.utils
+from enum import Enum
+
+def resize_mask(mask, shape):  # Resample `mask` bilinearly to size (shape[0], shape[1]).
+    return torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(shape[0], shape[1]), mode="bilinear").squeeze(1)  # reshaped to (N, 1, H, W) for interpolate(); the channel axis is squeezed out again
+
+class PorterDuffMode(Enum):  # Compositing operators handled by porter_duff_composite(); the member names are offered as the node's `mode` choices.
+    ADD = 0
+    CLEAR = 1
+    DARKEN = 2
+    DST = 3
+    DST_ATOP = 4
+    DST_IN = 5
+    DST_OUT = 6
+    DST_OVER = 7
+    LIGHTEN = 8
+    MULTIPLY = 9
+    OVERLAY = 10
+    SCREEN = 11
+    SRC = 12
+    SRC_ATOP = 13
+    SRC_IN = 14
+    SRC_OUT = 15
+    SRC_OVER = 16
+    XOR = 17
+
+
+def porter_duff_composite(src_image: torch.Tensor, src_alpha: torch.Tensor, dst_image: torch.Tensor, dst_alpha: torch.Tensor, mode: PorterDuffMode):
+    """Combine one source/destination image+alpha pair with the Porter-Duff operator `mode`; returns (out_image, out_alpha)."""
+    if mode == PorterDuffMode.ADD:
+        out_alpha = torch.clamp(src_alpha + dst_alpha, 0, 1)
+        out_image = torch.clamp(src_image + dst_image, 0, 1)
+    elif mode == PorterDuffMode.CLEAR:
+        out_alpha = torch.zeros_like(dst_alpha)
+        out_image = torch.zeros_like(dst_image)
+    elif mode == PorterDuffMode.DARKEN:
+        out_alpha = src_alpha + dst_alpha - src_alpha * dst_alpha
+        out_image = (1 - dst_alpha) * src_image + (1 - src_alpha) * dst_image + torch.min(src_image, dst_image)
+    elif mode == PorterDuffMode.DST:
+        out_alpha = dst_alpha
+        out_image = dst_image
+    elif mode == PorterDuffMode.DST_ATOP:
+        out_alpha = src_alpha
+        out_image = src_alpha * dst_image + (1 - dst_alpha) * src_image
+    elif mode == PorterDuffMode.DST_IN:
+        out_alpha = src_alpha * dst_alpha
+        out_image = dst_image * src_alpha
+    elif mode == PorterDuffMode.DST_OUT:
+        out_alpha = (1 - src_alpha) * dst_alpha
+        out_image = (1 - src_alpha) * dst_image
+    elif mode == PorterDuffMode.DST_OVER:
+        out_alpha = dst_alpha + (1 - dst_alpha) * src_alpha
+        out_image = dst_image + (1 - dst_alpha) * src_image
+    elif mode == PorterDuffMode.LIGHTEN:
+        out_alpha = src_alpha + dst_alpha - src_alpha * dst_alpha
+        out_image = (1 - dst_alpha) * src_image + (1 - src_alpha) * dst_image + torch.max(src_image, dst_image)
+    elif mode == PorterDuffMode.MULTIPLY:
+        out_alpha = src_alpha * dst_alpha
+        out_image = src_image * dst_image
+    elif mode == PorterDuffMode.OVERLAY:
+        out_alpha = src_alpha + dst_alpha - src_alpha * dst_alpha
+        out_image = torch.where(2 * dst_image < dst_alpha, 2 * src_image * dst_image,
+            src_alpha * dst_alpha - 2 * (dst_alpha - src_image) * (src_alpha - dst_image))
+    elif mode == PorterDuffMode.SCREEN:
+        out_alpha = src_alpha + dst_alpha - src_alpha * dst_alpha
+        out_image = src_image + dst_image - src_image * dst_image
+    elif mode == PorterDuffMode.SRC:
+        out_alpha = src_alpha
+        out_image = src_image
+    elif mode == PorterDuffMode.SRC_ATOP:
+        out_alpha = dst_alpha
+        out_image = dst_alpha * src_image + (1 - src_alpha) * dst_image
+    elif mode == PorterDuffMode.SRC_IN:
+        out_alpha = src_alpha * dst_alpha
+        out_image = src_image * dst_alpha
+    elif mode == PorterDuffMode.SRC_OUT:
+        out_alpha = (1 - dst_alpha) * src_alpha
+        out_image = (1 - dst_alpha) * src_image
+    elif mode == PorterDuffMode.SRC_OVER:
+        out_alpha = src_alpha + (1 - src_alpha) * dst_alpha
+        out_image = src_image + (1 - src_alpha) * dst_image
+    elif mode == PorterDuffMode.XOR:
+        out_alpha = (1 - dst_alpha) * src_alpha + (1 - src_alpha) * dst_alpha
+        out_image = (1 - dst_alpha) * src_image + (1 - src_alpha) * dst_image
+    else:
+        raise ValueError("unsupported Porter-Duff mode: {!r}".format(mode))  # fail here instead of returning (None, None) and breaking later in the caller
+    return out_image, out_alpha
+
+
+class PorterDuffImageComposite:  # Node that composites a source image/alpha batch with a destination batch using a selectable Porter-Duff mode.
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "source": ("IMAGE",),
+                "source_alpha": ("MASK",),
+                "destination": ("IMAGE",),
+                "destination_alpha": ("MASK",),
+                "mode": ([mode.name for mode in PorterDuffMode], {"default": PorterDuffMode.DST.name}),  # choices are the enum member names
+            },
+        }
+
+    RETURN_TYPES = ("IMAGE", "MASK")
+    FUNCTION = "composite"
+    CATEGORY = "mask/compositing"
+
+    def composite(self, source: torch.Tensor, source_alpha: torch.Tensor, destination: torch.Tensor, destination_alpha: torch.Tensor, mode):
+        batch_size = min(len(source), len(source_alpha), len(destination), len(destination_alpha))  # items beyond the shortest input are ignored
+        out_images = []
+        out_alphas = []
+        # Each batch item is composited on its own; the per-item results are stacked at the end.
+        for i in range(batch_size):
+            src_image = source[i]
+            dst_image = destination[i]
+
+            assert src_image.shape[2] == dst_image.shape[2] # inputs need to have same number of channels
+
+            src_alpha = source_alpha[i].unsqueeze(2)  # extra axis at index 2 so the alpha lines up with the image's channel axis
+            dst_alpha = destination_alpha[i].unsqueeze(2)
+            # Order matters: dst_alpha and src_image are matched to dst_image first, then src_alpha is matched to the (possibly resized) dst_alpha.
+            if dst_alpha.shape[:2] != dst_image.shape[:2]:
+                upscale_input = dst_alpha.unsqueeze(0).permute(0, 3, 1, 2)  # add a batch axis and move the last axis to position 1 for common_upscale
+                upscale_output = comfy.utils.common_upscale(upscale_input, dst_image.shape[1], dst_image.shape[0], upscale_method='bicubic', crop='center')
+                dst_alpha = upscale_output.permute(0, 2, 3, 1).squeeze(0)  # undo the permute and drop the batch axis
+            if src_image.shape != dst_image.shape:
+                upscale_input = src_image.unsqueeze(0).permute(0, 3, 1, 2)
+                upscale_output = comfy.utils.common_upscale(upscale_input, dst_image.shape[1], dst_image.shape[0], upscale_method='bicubic', crop='center')
+                src_image = upscale_output.permute(0, 2, 3, 1).squeeze(0)
+            if src_alpha.shape != dst_alpha.shape:
+                upscale_input = src_alpha.unsqueeze(0).permute(0, 3, 1, 2)
+                upscale_output = comfy.utils.common_upscale(upscale_input, dst_alpha.shape[1], dst_alpha.shape[0], upscale_method='bicubic', crop='center')
+                src_alpha = upscale_output.permute(0, 2, 3, 1).squeeze(0)
+
+            out_image, out_alpha = porter_duff_composite(src_image, src_alpha, dst_image, dst_alpha, PorterDuffMode[mode])  # `mode` is looked up by enum member name
+
+            out_images.append(out_image)
+            out_alphas.append(out_alpha.squeeze(2))  # drop the axis added by unsqueeze(2) above
+
+        result = (torch.stack(out_images), torch.stack(out_alphas))
+        return result
+
+
+class SplitImageWithAlpha:
+    """Node that separates a batch of images into 3-channel images and an inverted-alpha mask."""
+    CATEGORY = "mask/compositing"
+    RETURN_TYPES = ("IMAGE", "MASK")
+    FUNCTION = "split_image_with_alpha"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {"image": ("IMAGE",)}}
+
+    def split_image_with_alpha(self, image: torch.Tensor):
+        rgb_frames, alpha_frames = [], []
+        for frame in image:
+            rgb_frames.append(frame[:, :, :3])
+            # Frames without a fourth channel get an alpha of all ones.
+            has_alpha = frame.shape[2] > 3
+            alpha_frames.append(frame[:, :, 3] if has_alpha else torch.ones_like(frame[:, :, 0]))
+        return (torch.stack(rgb_frames), 1.0 - torch.stack(alpha_frames))
+
+
+class JoinImageWithAlpha:
+    """Node that attaches a mask to a batch of images as a fourth (alpha) channel."""
+    CATEGORY = "mask/compositing"
+    RETURN_TYPES = ("IMAGE",)
+    FUNCTION = "join_image_with_alpha"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        required = {
+            "image": ("IMAGE",),
+            "alpha": ("MASK",),
+        }
+        return {"required": required}
+
+    def join_image_with_alpha(self, image: torch.Tensor, alpha: torch.Tensor):
+        # Only as many frames as both inputs provide are joined.
+        count = min(len(image), len(alpha))
+        # The mask is resized via resize_mask() and inverted (1 - mask) before being appended.
+        inverted = 1.0 - resize_mask(alpha, image.shape[1:])
+        joined = [
+            torch.cat((image[idx][:,:,:3], inverted[idx].unsqueeze(2)), dim=2)
+            for idx in range(count)
+        ]
+        return (torch.stack(joined),)
+
+
+NODE_CLASS_MAPPINGS = {  # node identifier -> class implementing it
+    "PorterDuffImageComposite": PorterDuffImageComposite,
+    "SplitImageWithAlpha": SplitImageWithAlpha,
+    "JoinImageWithAlpha": JoinImageWithAlpha,
+}
+
+
+NODE_DISPLAY_NAME_MAPPINGS = {  # node identifier -> human-readable name (same keys as NODE_CLASS_MAPPINGS)
+    "PorterDuffImageComposite": "Porter-Duff Image Composite",
+    "SplitImageWithAlpha": "Split Image with Alpha",
+    "JoinImageWithAlpha": "Join Image with Alpha",
+}
diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py
new file mode 100644
index 000000000..9391c7147
--- /dev/null
+++ b/comfy_extras/nodes_custom_sampler.py
@@ -0,0 +1,245 @@
+import comfy.samplers
+import comfy.sample
+from comfy.k_diffusion import sampling as k_diffusion_sampling
+import latent_preview
+import torch
+
+
+class BasicScheduler:
+    """Node that computes a sigma schedule for a model with one of the named schedulers."""
+    RETURN_TYPES = ("SIGMAS",)
+    FUNCTION = "get_sigmas"
+    CATEGORY = "sampling/custom_sampling"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        required = {
+            "model": ("MODEL",),
+            "scheduler": (comfy.samplers.SCHEDULER_NAMES, ),
+            "steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
+        }
+        return {"required": required}
+
+    def get_sigmas(self, model, scheduler, steps):
+        return (comfy.samplers.calculate_sigmas_scheduler(model.model, scheduler, steps).cpu(), )
+
+
+class KarrasScheduler:
+    """Node that returns k-diffusion's get_sigmas_karras schedule for the given bounds and rho."""
+    RETURN_TYPES = ("SIGMAS",)
+    FUNCTION = "get_sigmas"
+    CATEGORY = "sampling/custom_sampling"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        required = {
+            "steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
+            "sigma_max": ("FLOAT", {"default": 14.614642, "min": 0.0, "max": 1000.0, "step":0.01, "round": False}),
+            "sigma_min": ("FLOAT", {"default": 0.0291675, "min": 0.0, "max": 1000.0, "step":0.01, "round": False}),
+            "rho": ("FLOAT", {"default": 7.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
+        }
+        return {"required": required}
+
+    def get_sigmas(self, steps, sigma_max, sigma_min, rho):
+        return (k_diffusion_sampling.get_sigmas_karras(n=steps, sigma_min=sigma_min, sigma_max=sigma_max, rho=rho), )
+
+class ExponentialScheduler:
+    """Node that returns k-diffusion's get_sigmas_exponential schedule for the given bounds."""
+    RETURN_TYPES = ("SIGMAS",)
+    FUNCTION = "get_sigmas"
+    CATEGORY = "sampling/custom_sampling"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        required = {
+            "steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
+            "sigma_max": ("FLOAT", {"default": 14.614642, "min": 0.0, "max": 1000.0, "step":0.01, "round": False}),
+            "sigma_min": ("FLOAT", {"default": 0.0291675, "min": 0.0, "max": 1000.0, "step":0.01, "round": False}),
+        }
+        return {"required": required}
+
+    def get_sigmas(self, steps, sigma_max, sigma_min):
+        return (k_diffusion_sampling.get_sigmas_exponential(n=steps, sigma_min=sigma_min, sigma_max=sigma_max), )
+
+class PolyexponentialScheduler:
+    """Node that returns k-diffusion's get_sigmas_polyexponential schedule for the given bounds and rho."""
+    RETURN_TYPES = ("SIGMAS",)
+    FUNCTION = "get_sigmas"
+    CATEGORY = "sampling/custom_sampling"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        required = {
+            "steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
+            "sigma_max": ("FLOAT", {"default": 14.614642, "min": 0.0, "max": 1000.0, "step":0.01, "round": False}),
+            "sigma_min": ("FLOAT", {"default": 0.0291675, "min": 0.0, "max": 1000.0, "step":0.01, "round": False}),
+            "rho": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
+        }
+        return {"required": required}
+
+    def get_sigmas(self, steps, sigma_max, sigma_min, rho):
+        return (k_diffusion_sampling.get_sigmas_polyexponential(n=steps, sigma_min=sigma_min, sigma_max=sigma_max, rho=rho), )
+
+class VPScheduler:
+    """Node that returns k-diffusion's get_sigmas_vp schedule for the given beta_d, beta_min and eps_s."""
+    RETURN_TYPES = ("SIGMAS",)
+    FUNCTION = "get_sigmas"
+    CATEGORY = "sampling/custom_sampling"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        required = {
+            "steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
+            "beta_d": ("FLOAT", {"default": 19.9, "min": 0.0, "max": 1000.0, "step":0.01, "round": False}), #TODO: fix default values
+            "beta_min": ("FLOAT", {"default": 0.1, "min": 0.0, "max": 1000.0, "step":0.01, "round": False}),
+            "eps_s": ("FLOAT", {"default": 0.001, "min": 0.0, "max": 1.0, "step":0.0001, "round": False}),
+        }
+        return {"required": required}
+
+    def get_sigmas(self, steps, beta_d, beta_min, eps_s):
+        return (k_diffusion_sampling.get_sigmas_vp(n=steps, beta_d=beta_d, beta_min=beta_min, eps_s=eps_s), )
+
+class SplitSigmas:
+    """Node that cuts a sigma sequence into two parts at index `step`."""
+    RETURN_TYPES = ("SIGMAS","SIGMAS")
+    FUNCTION = "get_sigmas"
+    CATEGORY = "sampling/custom_sampling"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        required = {
+            "sigmas": ("SIGMAS", ),
+            "step": ("INT", {"default": 0, "min": 0, "max": 10000}),
+        }
+        return {"required": required}
+
+    def get_sigmas(self, sigmas, step):
+        # The entry at index `step` is shared: it ends the first part and starts the second.
+        return (sigmas[:step + 1], sigmas[step:])
+
+class KSamplerSelect:
+    """Node that turns a sampler name into a SAMPLER object."""
+    RETURN_TYPES = ("SAMPLER",)
+    FUNCTION = "get_sampler"
+    CATEGORY = "sampling/custom_sampling"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        choices = (comfy.samplers.SAMPLER_NAMES, )
+        return {"required": {"sampler_name": choices}}
+
+    def get_sampler(self, sampler_name):
+        # Whatever sampler_class() returns is called with no arguments to build the sampler.
+        sampler_type = comfy.samplers.sampler_class(sampler_name)
+        return (sampler_type(), )
+
+class SamplerDPMPP_2M_SDE:
+    """Node that builds the "dpmpp_2m_sde" (or "_gpu") k-sampler with explicit eta, s_noise and solver_type."""
+    RETURN_TYPES = ("SAMPLER",)
+    FUNCTION = "get_sampler"
+    CATEGORY = "sampling/custom_sampling"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        required = {
+            "solver_type": (['midpoint', 'heun'], ),
+            "eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
+            "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
+            "noise_device": (['gpu', 'cpu'], ),
+        }
+        return {"required": required}
+
+    def get_sampler(self, solver_type, eta, s_noise, noise_device):
+        # Any noise_device other than 'cpu' selects the "_gpu" sampler name.
+        sampler_name = "dpmpp_2m_sde" if noise_device == 'cpu' else "dpmpp_2m_sde_gpu"
+        extra_options = {"eta": eta, "s_noise": s_noise, "solver_type": solver_type}
+        sampler = comfy.samplers.ksampler(sampler_name, extra_options)()
+        return (sampler, )
+
+
+class SamplerDPMPP_SDE:
+    """Node that builds the "dpmpp_sde" (or "_gpu") k-sampler with explicit eta, s_noise and r."""
+    RETURN_TYPES = ("SAMPLER",)
+    FUNCTION = "get_sampler"
+    CATEGORY = "sampling/custom_sampling"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        required = {
+            "eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
+            "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
+            "r": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
+            "noise_device": (['gpu', 'cpu'], ),
+        }
+        return {"required": required}
+
+    def get_sampler(self, eta, s_noise, r, noise_device):
+        # Any noise_device other than 'cpu' selects the "_gpu" sampler name.
+        sampler_name = "dpmpp_sde" if noise_device == 'cpu' else "dpmpp_sde_gpu"
+        extra_options = {"eta": eta, "s_noise": s_noise, "r": r}
+        sampler = comfy.samplers.ksampler(sampler_name, extra_options)()
+        return (sampler, )
+
+class SamplerCustom:
+    """Sampling node driven by an explicit SAMPLER object and SIGMAS schedule."""
+    RETURN_TYPES = ("LATENT","LATENT")
+    RETURN_NAMES = ("output", "denoised_output")
+    FUNCTION = "sample"
+    CATEGORY = "sampling/custom_sampling"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        required = {
+            "model": ("MODEL",),
+            "add_noise": ("BOOLEAN", {"default": True}),
+            "noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
+            "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.5, "round": 0.01}),
+            "positive": ("CONDITIONING", ),
+            "negative": ("CONDITIONING", ),
+            "sampler": ("SAMPLER", ),
+            "sigmas": ("SIGMAS", ),
+            "latent_image": ("LATENT", ),
+        }
+        return {"required": required}
+
+    def sample(self, model, add_noise, noise_seed, cfg, positive, negative, sampler, sigmas, latent_image):
+        latent = latent_image
+        latent_image = latent["samples"]
+
+        # With add_noise disabled an all-zero CPU tensor stands in for the seeded noise.
+        if add_noise:
+            noise = comfy.sample.prepare_noise(latent_image, noise_seed, latent.get("batch_index"))
+        else:
+            noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu")
+        noise_mask = latent.get("noise_mask")
+
+        # x0_output is handed to the preview callback; an "x0" entry found in it afterwards feeds the denoised output.
+        x0_output = {}
+        callback = latent_preview.prepare_callback(model, sigmas.shape[-1] - 1, x0_output)
+
+        disable_pbar = False
+        samples = comfy.sample.sample_custom(model, noise, cfg, sampler, sigmas, positive, negative, latent_image, noise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=noise_seed)
+
+        out = latent.copy()
+        out["samples"] = samples
+        if "x0" not in x0_output:
+            # No "x0" was recorded: both outputs are the same dict.
+            return (out, out)
+
+        # Otherwise the second output is a separate copy carrying the processed x0.
+        out_denoised = latent.copy()
+        out_denoised["samples"] = model.model.process_latent_out(x0_output["x0"].cpu())
+        return (out, out_denoised)
+
+NODE_CLASS_MAPPINGS = {  # node identifier -> class implementing it; every node class defined in this file is listed
+    "SamplerCustom": SamplerCustom,
+    "KarrasScheduler": KarrasScheduler,
+    "ExponentialScheduler": ExponentialScheduler,
+    "PolyexponentialScheduler": PolyexponentialScheduler,
+    "VPScheduler": VPScheduler,
+    "KSamplerSelect": KSamplerSelect,
+    "SamplerDPMPP_2M_SDE": SamplerDPMPP_2M_SDE,
+    "SamplerDPMPP_SDE": SamplerDPMPP_SDE,
+    "BasicScheduler": BasicScheduler,
+    "SplitSigmas": SplitSigmas,
+}
diff --git a/comfy_extras/nodes_freelunch.py b/comfy_extras/nodes_freelunch.py
new file mode 100644
index 000000000..07a88bd96
--- /dev/null
+++ b/comfy_extras/nodes_freelunch.py
@@ -0,0 +1,67 @@
+#code originally taken from: https://github.com/ChenyangSi/FreeU (under MIT License)
+
+import torch
+
+
+def Fourier_filter(x, threshold, scale):
+    # Multiply the centre band of x's shifted 2D spectrum by `scale`; the result is cast back to x.dtype.
+    spatial_dims = (-2, -1)
+    # Forward FFT on a float32 copy, shifted so the zero frequency sits in the middle of the last two dims.
+    spectrum = torch.fft.fftshift(torch.fft.fftn(x.float(), dim=spatial_dims), dim=spatial_dims)
+
+    # Gain map: 1 everywhere, `scale` in the box spanning centre - threshold to centre + threshold.
+    _, _, height, width = spectrum.shape
+    gain = torch.ones(spectrum.shape, device=x.device)
+    rows = slice(height // 2 - threshold, height // 2 + threshold)
+    cols = slice(width // 2 - threshold, width // 2 + threshold)
+    gain[..., rows, cols] = scale
+
+    # Undo the shift, invert the FFT and keep only the real part.
+    filtered = torch.fft.ifftn(torch.fft.ifftshift(spectrum * gain, dim=spatial_dims), dim=spatial_dims).real
+
+    return filtered.to(x.dtype)
+
+
+class FreeU:
+    # Node that returns a clone of the input MODEL with a FreeU output-block patch installed.
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": { "model": ("MODEL",),
+                             "b1": ("FLOAT", {"default": 1.1, "min": 0.0, "max": 10.0, "step": 0.01}),
+                             "b2": ("FLOAT", {"default": 1.2, "min": 0.0, "max": 10.0, "step": 0.01}),
+                             "s1": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 10.0, "step": 0.01}),
+                             "s2": ("FLOAT", {"default": 0.2, "min": 0.0, "max": 10.0, "step": 0.01}),
+                              }}
+    RETURN_TYPES = ("MODEL",)
+    FUNCTION = "patch"
+    CATEGORY = "_for_testing"
+
+    def patch(self, model, b1, b2, s1, s2):
+        model_channels = model.model.model_config.unet_config["model_channels"]
+        scale_dict = {model_channels * 4: (b1, s1), model_channels * 2: (b2, s2)}  # h.shape[1] -> (gain for h, gain passed to Fourier_filter for hsp)
+        on_cpu_devices = {}  # devices on which the torch.fft path already failed once; used as a set
+
+        def output_block_patch(h, hsp, transformer_options):
+            scale = scale_dict.get(h.shape[1], None)
+            if scale is not None:
+                h[:,:h.shape[1] // 2] = h[:,:h.shape[1] // 2] * scale[0]  # in place: scales the first half of dim 1
+                if hsp.device not in on_cpu_devices:
+                    try:
+                        hsp = Fourier_filter(hsp, threshold=1, scale=scale[1])
+                    except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must not be taken for an unsupported device
+                        print("Device", hsp.device, "does not support the torch.fft functions used in the FreeU node, switching to CPU.")
+                        on_cpu_devices[hsp.device] = True
+                        hsp = Fourier_filter(hsp.cpu(), threshold=1, scale=scale[1]).to(hsp.device)
+                else:
+                    hsp = Fourier_filter(hsp.cpu(), threshold=1, scale=scale[1]).to(hsp.device)
+
+            return h, hsp
+
+        m = model.clone()
+        m.set_model_output_block_patch(output_block_patch)
+        return (m, )
+
+
+NODE_CLASS_MAPPINGS = {  # node identifier string -> implementing class
+    "FreeU": FreeU,
+}
diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py
new file mode 100644
index 000000000..001de39fc
--- /dev/null
+++ b/comfy_extras/nodes_latent.py
@@ -0,0 +1,74 @@
+import comfy.utils
+
+def reshape_latent_to(target_shape, latent):
+    # Resize `latent` (bilinear, "center" crop) when its non-batch dims differ from target_shape, then hand it
+    # to repeat_to_batch_size for target_shape[0]. No movedim here: Tensor.movedim is not in-place, so a bare call does nothing.
+    if latent.shape[1:] != target_shape[1:]:
+        latent = comfy.utils.common_upscale(latent, target_shape[3], target_shape[2], "bilinear", "center")
+    return comfy.utils.repeat_to_batch_size(latent, target_shape[0])
+
+
+class LatentAdd:
+    # Node: element-wise sum of the "samples" tensors of two LATENT inputs.
+    RETURN_TYPES = ("LATENT",)
+    FUNCTION = "op"
+    CATEGORY = "latent/advanced"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        # Two required LATENT inputs.
+        return {"required": {"samples1": ("LATENT",), "samples2": ("LATENT",)}}
+
+    def op(self, samples1, samples2):
+        # Copy samples1 so the input mapping is not modified; only "samples" is replaced.
+        result = samples1.copy()
+        base = samples1["samples"]
+        # samples2's tensor goes through reshape_latent_to with base's shape before the addition.
+        other = reshape_latent_to(base.shape, samples2["samples"])
+        result["samples"] = base + other
+        return (result,)
+
+class LatentSubtract:
+    # Node: element-wise difference (samples1 - samples2) of the "samples" tensors of two LATENT inputs.
+    RETURN_TYPES = ("LATENT",)
+    FUNCTION = "op"
+    CATEGORY = "latent/advanced"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        # Two required LATENT inputs.
+        return {"required": {"samples1": ("LATENT",), "samples2": ("LATENT",)}}
+
+    def op(self, samples1, samples2):
+        # Copy samples1 so the input mapping is not modified; only "samples" is replaced.
+        result = samples1.copy()
+        minuend = samples1["samples"]
+        # samples2's tensor goes through reshape_latent_to with minuend's shape before the subtraction.
+        subtrahend = reshape_latent_to(minuend.shape, samples2["samples"])
+        result["samples"] = minuend - subtrahend
+        return (result,)
+
+class LatentMultiply:
+    # Node: multiplies the "samples" tensor of a LATENT by a scalar.
+    RETURN_TYPES = ("LATENT",)
+    FUNCTION = "op"
+    CATEGORY = "latent/advanced"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        multiplier_spec = ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01})
+        return {"required": {"samples": ("LATENT",), "multiplier": multiplier_spec}}
+
+    def op(self, samples, multiplier):
+        # Copy first so the incoming mapping keeps its own "samples" entry.
+        result = samples.copy()
+
+        scaled = samples["samples"] * multiplier
+        result["samples"] = scaled
+        return (result,)
+
+NODE_CLASS_MAPPINGS = {  # node identifier string -> implementing class
+    "LatentAdd": LatentAdd,
+    "LatentSubtract": LatentSubtract,
+    "LatentMultiply": LatentMultiply,
+}
diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py
index 43f623a62..9b0b289c1 100644
--- a/comfy_extras/nodes_mask.py
+++ b/comfy_extras/nodes_mask.py
@@ -1,6 +1,7 @@
 import numpy as np
-from scipy.ndimage import grey_dilation
+import scipy.ndimage
 import torch
+import comfy.utils
 
 from nodes import MAX_RESOLUTION
 
@@ -8,6 +9,8 @@ def composite(destination, source, x, y, mask = None, multiplier = 8, resize_sou
     if resize_source:
         source = torch.nn.functional.interpolate(source, size=(destination.shape[2], destination.shape[3]), mode="bilinear")
 
+    source = comfy.utils.repeat_to_batch_size(source, destination.shape[0])
+
     x = max(-source.shape[3] * multiplier, min(x, destination.shape[3] * multiplier))
     y = max(-source.shape[2] * multiplier, min(y, destination.shape[2] * multiplier))
 
@@ -18,8 +21,8 @@ def composite(destination, source, x, y, mask = None, multiplier = 8, resize_sou
         mask = torch.ones_like(source)
     else:
         mask = mask.clone()
-        mask = torch.nn.functional.interpolate(mask[None, None], size=(source.shape[2], source.shape[3]), mode="bilinear")
-        mask = mask.repeat((source.shape[0], source.shape[1], 1, 1))
+        mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(source.shape[2], source.shape[3]), mode="bilinear")
+        mask = comfy.utils.repeat_to_batch_size(mask, source.shape[0])
 
     # calculate the bounds of the source that will be overlapping the destination
     # this prevents the source trying to overwrite latent pixels that are out of bounds
@@ -111,7 +114,7 @@ class ImageToMask:
         return {
                 "required": {
                     "image": ("IMAGE",),
-                    "channel": (["red", "green", "blue"],),
+                    "channel": (["red", "green", "blue", "alpha"],),
                 }
         }
 
@@ -121,8 +124,8 @@ class ImageToMask:
     FUNCTION = "image_to_mask"
 
     def image_to_mask(self, image, channel):
-        channels = ["red", "green", "blue"]
-        mask = image[0, :, :, channels.index(channel)]
+        channels = ["red", "green", "blue", "alpha"]
+        mask = image[:, :, :, channels.index(channel)]
         return (mask,)
 
 class ImageColorToMask:
@@ -141,8 +144,8 @@ class ImageColorToMask:
     FUNCTION = "image_to_mask"
 
     def image_to_mask(self, image, color):
-        temp = (torch.clamp(image[0], 0, 1.0) * 255.0).round().to(torch.int)
-        temp = torch.bitwise_left_shift(temp[:,:,0], 16) + torch.bitwise_left_shift(temp[:,:,1], 8) + temp[:,:,2]
+        temp = (torch.clamp(image, 0, 1.0) * 255.0).round().to(torch.int)
+        temp = torch.bitwise_left_shift(temp[:,:,:,0], 16) + torch.bitwise_left_shift(temp[:,:,:,1], 8) + temp[:,:,:,2]
         mask = torch.where(temp == color, 255, 0).float()
         return (mask,)
 
@@ -164,7 +167,7 @@ class SolidMask:
     FUNCTION = "solid"
 
     def solid(self, value, width, height):
-        out = torch.full((height, width), value, dtype=torch.float32, device="cpu")
+        out = torch.full((1, height, width), value, dtype=torch.float32, device="cpu")
         return (out,)
 
 class InvertMask:
@@ -206,7 +209,8 @@ class CropMask:
     FUNCTION = "crop"
 
     def crop(self, mask, x, y, width, height):
-        out = mask[y:y + height, x:x + width]
+        mask = mask.reshape((-1, mask.shape[-2], mask.shape[-1]))
+        out = mask[:, y:y + height, x:x + width]
         return (out,)
 
 class MaskComposite:
@@ -229,27 +233,28 @@ class MaskComposite:
     FUNCTION = "combine"
 
     def combine(self, destination, source, x, y, operation):
-        output = destination.clone()
+        output = destination.reshape((-1, destination.shape[-2], destination.shape[-1])).clone()
+        source = source.reshape((-1, source.shape[-2], source.shape[-1]))
 
         left, top = (x, y,)
-        right, bottom = (min(left + source.shape[1], destination.shape[1]), min(top + source.shape[0], destination.shape[0]))
+        right, bottom = (min(left + source.shape[-1], destination.shape[-1]), min(top + source.shape[-2], destination.shape[-2]))
         visible_width, visible_height = (right - left, bottom - top,)
 
-        source_portion = source[:visible_height, :visible_width]
-        destination_portion = destination[top:bottom, left:right]
+        source_portion = source[:, :visible_height, :visible_width]  # source is (batch, H, W) here: keep dim 0, crop H and W
+        destination_portion = output[:, top:bottom, left:right]  # slice the reshaped copy so a 2-D destination works too
 
         if operation == "multiply":
-            output[top:bottom, left:right] = destination_portion * source_portion
+            output[:, top:bottom, left:right] = destination_portion * source_portion
         elif operation == "add":
-            output[top:bottom, left:right] = destination_portion + source_portion
+            output[:, top:bottom, left:right] = destination_portion + source_portion
         elif operation == "subtract":
-            output[top:bottom, left:right] = destination_portion - source_portion
+            output[:, top:bottom, left:right] = destination_portion - source_portion
         elif operation == "and":
-            output[top:bottom, left:right] = torch.bitwise_and(destination_portion.round().bool(), source_portion.round().bool()).float()
+            output[:, top:bottom, left:right] = torch.bitwise_and(destination_portion.round().bool(), source_portion.round().bool()).float()
         elif operation == "or":
-            output[top:bottom, left:right] = torch.bitwise_or(destination_portion.round().bool(), source_portion.round().bool()).float()
+            output[:, top:bottom, left:right] = torch.bitwise_or(destination_portion.round().bool(), source_portion.round().bool()).float()
         elif operation == "xor":
-            output[top:bottom, left:right] = torch.bitwise_xor(destination_portion.round().bool(), source_portion.round().bool()).float()
+            output[:, top:bottom, left:right] = torch.bitwise_xor(destination_portion.round().bool(), source_portion.round().bool()).float()
 
         output = torch.clamp(output, 0.0, 1.0)
 
@@ -275,7 +280,7 @@ class FeatherMask:
     FUNCTION = "feather"
 
     def feather(self, mask, left, top, right, bottom):
-        output = mask.clone()
+        output = mask.reshape((-1, mask.shape[-2], mask.shape[-1])).clone()
 
         left = min(left, output.shape[1])
         right = min(right, output.shape[1])
@@ -284,19 +289,19 @@ class FeatherMask:
 
         for x in range(left):
             feather_rate = (x + 1.0) / left
-            output[:, x] *= feather_rate
+            output[:, :, x] *= feather_rate
 
         for x in range(right):
             feather_rate = (x + 1) / right
-            output[:, -x] *= feather_rate
+            output[:, :, -(x + 1)] *= feather_rate  # count from the last column; a plain -x is column 0 when x == 0
 
         for y in range(top):
             feather_rate = (y + 1) / top
-            output[y, :] *= feather_rate
+            output[:, y, :] *= feather_rate
 
         for y in range(bottom):
             feather_rate = (y + 1) / bottom
-            output[-y, :] *= feather_rate
+            output[:, -(y + 1), :] *= feather_rate  # count from the last row; a plain -y is row 0 when y == 0
 
         return (output,)
     
@@ -306,7 +311,7 @@ class GrowMask:
         return {
             "required": {
                 "mask": ("MASK",),
-                "expand": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 1}),
+                "expand": ("INT", {"default": 0, "min": -MAX_RESOLUTION, "max": MAX_RESOLUTION, "step": 1}),
                 "tapered_corners": ("BOOLEAN", {"default": True}),
             },
         }
@@ -322,12 +327,18 @@ class GrowMask:
         kernel = np.array([[c, 1, c],
                            [1, 1, 1],
                            [c, 1, c]])
-        output = mask.numpy().copy()
-        while expand > 0:
-            output = grey_dilation(output, footprint=kernel)
-            expand -= 1
-        output = torch.from_numpy(output)
-        return (output,)
+        mask = mask.reshape((-1, mask.shape[-2], mask.shape[-1]))
+        out = []
+        for m in mask:
+            output = m.numpy()
+            for _ in range(abs(expand)):
+                if expand < 0:
+                    output = scipy.ndimage.grey_erosion(output, footprint=kernel)
+                else:
+                    output = scipy.ndimage.grey_dilation(output, footprint=kernel)
+            output = torch.from_numpy(output)
+            out.append(output)
+        return (torch.stack(out, dim=0),)
 
 
 
diff --git a/latent_preview.py b/latent_preview.py
index 87240a582..740e08607 100644
--- a/latent_preview.py
+++ b/latent_preview.py
@@ -5,6 +5,7 @@ import numpy as np
 from comfy.cli_args import args, LatentPreviewMethod
 from comfy.taesd.taesd import TAESD
 import folder_paths
+import comfy.utils
 
 MAX_PREVIEW_RESOLUTION = 512
 
@@ -74,4 +75,21 @@ def get_previewer(device, latent_format):
                 previewer = Latent2RGBPreviewer(latent_format.latent_rgb_factors)
     return previewer
 
+def prepare_callback(model, steps, x0_output_dict=None):
+    # Build the per-step sampling callback: it stores x0 in x0_output_dict["x0"] (when a dict is given),
+    # asks the previewer (if any) for a preview image and advances a ProgressBar sized to `steps`.
+    preview_format = "JPEG"  # fixed value, always one of the accepted "JPEG"/"PNG" formats
+
+    previewer = get_previewer(model.load_device, model.model.latent_format)
+
+    pbar = comfy.utils.ProgressBar(steps)
+    def callback(step, x0, x, total_steps):
+        if x0_output_dict is not None:
+            x0_output_dict["x0"] = x0
+
+        preview_bytes = None
+        if previewer:
+            preview_bytes = previewer.decode_latent_to_preview_image(preview_format, x0)
+        pbar.update_absolute(step + 1, total_steps, preview_bytes)
+    return callback
 
diff --git a/nodes.py b/nodes.py
index 18d82ea80..16bf07cca 100644
--- a/nodes.py
+++ b/nodes.py
@@ -891,7 +891,7 @@ class EmptyLatentImage:
     def INPUT_TYPES(s):
         return {"required": { "width": ("INT", {"default": 512, "min": 16, "max": MAX_RESOLUTION, "step": 8}),
                               "height": ("INT", {"default": 512, "min": 16, "max": MAX_RESOLUTION, "step": 8}),
-                              "batch_size": ("INT", {"default": 1, "min": 1, "max": 64})}}
+                              "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
     RETURN_TYPES = ("LATENT",)
     FUNCTION = "generate"
 
@@ -967,8 +967,8 @@ class LatentUpscale:
     @classmethod
     def INPUT_TYPES(s):
         return {"required": { "samples": ("LATENT",), "upscale_method": (s.upscale_methods,),
-                              "width": ("INT", {"default": 512, "min": 64, "max": MAX_RESOLUTION, "step": 8}),
-                              "height": ("INT", {"default": 512, "min": 64, "max": MAX_RESOLUTION, "step": 8}),
+                              "width": ("INT", {"default": 512, "min": 0, "max": MAX_RESOLUTION, "step": 8}),
+                              "height": ("INT", {"default": 512, "min": 0, "max": MAX_RESOLUTION, "step": 8}),
                               "crop": (s.crop_methods,)}}
     RETURN_TYPES = ("LATENT",)
     FUNCTION = "upscale"
@@ -976,8 +976,22 @@ class LatentUpscale:
     CATEGORY = "latent"
 
     def upscale(self, samples, upscale_method, width, height, crop):
-        s = samples.copy()
-        s["samples"] = comfy.utils.common_upscale(samples["samples"], width // 8, height // 8, upscale_method, crop)
+        if width == 0 and height == 0:
+            s = samples
+        else:
+            s = samples.copy()
+
+            if width == 0:
+                height = max(64, height)
+                width = max(64, round(samples["samples"].shape[3] * height / samples["samples"].shape[2]))
+            elif height == 0:
+                width = max(64, width)
+                height = max(64, round(samples["samples"].shape[2] * width / samples["samples"].shape[3]))
+            else:
+                width = max(64, width)
+                height = max(64, height)
+
+            s["samples"] = comfy.utils.common_upscale(samples["samples"], width // 8, height // 8, upscale_method, crop)
         return (s,)
 
 class LatentUpscaleBy:
@@ -1175,11 +1189,8 @@ class SetLatentNoiseMask:
         s["noise_mask"] = mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1]))
         return (s,)
 
-
 def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive, negative, latent, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False):
-    device = comfy.model_management.get_torch_device()
     latent_image = latent["samples"]
-
     if disable_noise:
         noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu")
     else:
@@ -1190,22 +1201,11 @@ def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive,
     if "noise_mask" in latent:
         noise_mask = latent["noise_mask"]
 
-    preview_format = "JPEG"
-    if preview_format not in ["JPEG", "PNG"]:
-        preview_format = "JPEG"
-
-    previewer = latent_preview.get_previewer(device, model.model.latent_format)
-
-    pbar = comfy.utils.ProgressBar(steps)
-    def callback(step, x0, x, total_steps):
-        preview_bytes = None
-        if previewer:
-            preview_bytes = previewer.decode_latent_to_preview_image(preview_format, x0)
-        pbar.update_absolute(step + 1, total_steps, preview_bytes)
-
+    callback = latent_preview.prepare_callback(model, steps)
+    disable_pbar = False
     samples = comfy.sample.sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image,
                                   denoise=denoise, disable_noise=disable_noise, start_step=start_step, last_step=last_step,
-                                  force_full_denoise=force_full_denoise, noise_mask=noise_mask, callback=callback, seed=seed)
+                                  force_full_denoise=force_full_denoise, noise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed)
     out = latent.copy()
     out["samples"] = samples
     return (out, )
@@ -1355,7 +1355,7 @@ class LoadImage:
             mask = 1. - torch.from_numpy(mask)
         else:
             mask = torch.zeros((64,64), dtype=torch.float32, device="cpu")
-        return (image, mask)
+        return (image, mask.unsqueeze(0))
 
     @classmethod
     def IS_CHANGED(s, image):
@@ -1402,7 +1402,7 @@ class LoadImageMask:
                 mask = 1. - mask
         else:
             mask = torch.zeros((64,64), dtype=torch.float32, device="cpu")
-        return (mask,)
+        return (mask.unsqueeze(0),)
 
     @classmethod
     def IS_CHANGED(s, image, channel):
@@ -1429,8 +1429,8 @@ class ImageScale:
     @classmethod
     def INPUT_TYPES(s):
         return {"required": { "image": ("IMAGE",), "upscale_method": (s.upscale_methods,),
-                              "width": ("INT", {"default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 1}),
-                              "height": ("INT", {"default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 1}),
+                              "width": ("INT", {"default": 512, "min": 0, "max": MAX_RESOLUTION, "step": 1}),
+                              "height": ("INT", {"default": 512, "min": 0, "max": MAX_RESOLUTION, "step": 1}),
                               "crop": (s.crop_methods,)}}
     RETURN_TYPES = ("IMAGE",)
     FUNCTION = "upscale"
@@ -1438,9 +1438,18 @@ class ImageScale:
     CATEGORY = "image/upscaling"
 
     def upscale(self, image, upscale_method, width, height, crop):
-        samples = image.movedim(-1,1)
-        s = comfy.utils.common_upscale(samples, width, height, upscale_method, crop)
-        s = s.movedim(1,-1)
+        if width == 0 and height == 0:
+            s = image
+        else:
+            samples = image.movedim(-1,1)
+
+            if width == 0:
+                width = max(1, round(samples.shape[3] * height / samples.shape[2]))
+            elif height == 0:
+                height = max(1, round(samples.shape[2] * width / samples.shape[3]))
+
+            s = comfy.utils.common_upscale(samples, width, height, upscale_method, crop)
+            s = s.movedim(1,-1)
         return (s,)
 
 class ImageScaleBy:
@@ -1503,7 +1512,7 @@ class EmptyImage:
     def INPUT_TYPES(s):
         return {"required": { "width": ("INT", {"default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 1}),
                               "height": ("INT", {"default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 1}),
-                              "batch_size": ("INT", {"default": 1, "min": 1, "max": 64}),
+                              "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
                               "color": ("INT", {"default": 0, "min": 0, "max": 0xFFFFFF, "step": 1, "display": "color"}),
                               }}
     RETURN_TYPES = ("IMAGE",)
@@ -1604,7 +1613,7 @@ NODE_CLASS_MAPPINGS = {
     "ImageBatch": ImageBatch,
     "ImagePadForOutpaint": ImagePadForOutpaint,
     "EmptyImage": EmptyImage,
-    "ConditioningAverage ": ConditioningAverage ,
+    "ConditioningAverage": ConditioningAverage ,
     "ConditioningCombine": ConditioningCombine,
     "ConditioningConcat": ConditioningConcat,
     "ConditioningSetArea": ConditioningSetArea,
@@ -1772,13 +1781,24 @@ def load_custom_nodes():
         print()
 
 def init_custom_nodes():
-    load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_hypernetwork.py"))
-    load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_upscale_model.py"))
-    load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_post_processing.py"))
-    load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_mask.py"))
-    load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_rebatch.py"))
-    load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_model_merging.py"))
-    load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_tomesd.py"))
-    load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_clip_sdxl.py"))
-    load_custom_node(os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras"), "nodes_canny.py"))
+    extras_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras")
+    extras_files = [
+        "nodes_latent.py",
+        "nodes_hypernetwork.py",
+        "nodes_upscale_model.py",
+        "nodes_post_processing.py",
+        "nodes_mask.py",
+        "nodes_compositing.py",
+        "nodes_rebatch.py",
+        "nodes_model_merging.py",
+        "nodes_tomesd.py",
+        "nodes_clip_sdxl.py",
+        "nodes_canny.py",
+        "nodes_freelunch.py",
+        "nodes_custom_sampler.py"
+    ]
+
+    for node_file in extras_files:
+        load_custom_node(os.path.join(extras_dir, node_file))
+
     load_custom_nodes()
diff --git a/server.py b/server.py
index b2e16716b..63f337a87 100644
--- a/server.py
+++ b/server.py
@@ -413,7 +413,11 @@ class PromptServer():
         async def get_object_info(request):
             out = {}
             for x in nodes.NODE_CLASS_MAPPINGS:
-                out[x] = node_info(x)
+                try:
+                    out[x] = node_info(x)
+                except Exception as e:
+                    print(f"[ERROR] An error occurred while retrieving information for the '{x}' node.", file=sys.stderr)
+                    traceback.print_exc()
             return web.json_response(out)
 
         @routes.get("/object_info/{node_class}")
diff --git a/web/extensions/core/widgetInputs.js b/web/extensions/core/widgetInputs.js
index 606605f0a..ce05a29e9 100644
--- a/web/extensions/core/widgetInputs.js
+++ b/web/extensions/core/widgetInputs.js
@@ -3,6 +3,13 @@ import { app } from "../../scripts/app.js";
 
 const CONVERTED_TYPE = "converted-widget";
 const VALID_TYPES = ["STRING", "combo", "number", "BOOLEAN"];
+const CONFIG = Symbol();
+const GET_CONFIG = Symbol();
+
+function getConfig(widgetName) { // call with `this` = node; looks the widget up in the node type's required, then optional inputs
+	const { nodeData } = this.constructor;
+	return nodeData?.input?.required?.[widgetName] ?? nodeData?.input?.optional?.[widgetName]; // `required` may be missing
+}
 
 function isConvertableWidget(widget, config) {
 	return (VALID_TYPES.includes(widget.type) || VALID_TYPES.includes(config[0])) && !widget.options?.forceInput;
@@ -55,12 +62,12 @@ function showWidget(widget) {
 function convertToInput(node, widget, config) {
 	hideWidget(node, widget);
 
-	const { linkType } = getWidgetType(config);
+	const { type } = getWidgetType(config);
 
 	// Add input and store widget config for creating on primitive node
 	const sz = node.size;
-	node.addInput(widget.name, linkType, {
-		widget: { name: widget.name, config },
+	node.addInput(widget.name, type, {
+		widget: { name: widget.name, [GET_CONFIG]: () => config },
 	});
 
 	for (const widget of node.widgets) {
@@ -87,12 +94,10 @@ function convertToWidget(node, widget) {
 function getWidgetType(config) {
 	// Special handling for COMBO so we restrict links based on the entries
 	let type = config[0];
-	let linkType = type;
 	if (type instanceof Array) {
 		type = "COMBO";
-		linkType = linkType.join(",");
 	}
-	return { type, linkType };
+	return { type };
 }
 
 app.registerExtension({
@@ -116,7 +121,7 @@ app.registerExtension({
 							callback: () => convertToWidget(this, w),
 						});
 					} else {
-						const config = nodeData?.input?.required[w.name] || nodeData?.input?.optional?.[w.name] || [w.type, w.options || {}];
+						const config = getConfig.call(this, w.name) ?? [w.type, w.options || {}];
 						if (isConvertableWidget(w, config)) {
 							toInput.push({
 								content: `Convert ${w.name} to input`,
@@ -137,33 +142,67 @@ app.registerExtension({
 			return r;
 		};
 
-		const origOnNodeCreated = nodeType.prototype.onNodeCreated
+		nodeType.prototype.onGraphConfigured = function () { // NOTE(review): presumably invoked by the app once the whole graph is loaded — confirm
+			if (!this.inputs) return;
+
+			for (const input of this.inputs) {
+				if (input.widget) {
+					if (!input.widget[GET_CONFIG]) {
+						input.widget[GET_CONFIG] = () => getConfig.call(this, input.widget.name); // look the config up lazily from the node definition
+					}
+
+					// Cleanup old widget config
+					if (input.widget.config) {
+						if (input.widget.config[0] instanceof Array) {
+							// If we are an old converted combo then replace the input type and the stored link data
+							input.type = "COMBO";
+
+							const link = app.graph.links[input.link];
+							if (link) {
+								link.type = input.type;
+							}
+						}
+						delete input.widget.config;
+					}
+
+					const w = this.widgets.find((w) => w.name === input.widget.name); // widget with the same name as this input, if any
+					if (w) {
+						hideWidget(this, w); // the input stays converted, so its widget is hidden
+					} else {
+						convertToWidget(this, input); // no widget of that name on the node
+					}
+				}
+			}
+		};
+
+		const origOnNodeCreated = nodeType.prototype.onNodeCreated;
 		nodeType.prototype.onNodeCreated = function () {
 			const r = origOnNodeCreated ? origOnNodeCreated.apply(this) : undefined;
-			if (this.widgets) {
+
+			// When node is created, convert any force/default inputs
+			if (!app.configuringGraph && this.widgets) {
 				for (const w of this.widgets) {
 					if (w?.options?.forceInput || w?.options?.defaultInput) {
-						const config = nodeData?.input?.required[w.name] || nodeData?.input?.optional?.[w.name] || [w.type, w.options || {}];
+						const config = getConfig.call(this, w.name) ?? [w.type, w.options || {}];
 						convertToInput(this, w, config);
 					}
 				}
 			}
-			return r;
-		}
 
-		// On initial configure of nodes hide all converted widgets
+			return r;
+		};
+
 		const origOnConfigure = nodeType.prototype.onConfigure;
 		nodeType.prototype.onConfigure = function () {
 			const r = origOnConfigure ? origOnConfigure.apply(this, arguments) : undefined;
-
-			if (this.inputs) {
+			if (!app.configuringGraph && this.inputs) {
+				// On copy + paste of nodes, ensure that widget configs are set up
 				for (const input of this.inputs) {
-					if (input.widget && !input.widget.config[1]?.forceInput) {
+					if (input.widget && !input.widget[GET_CONFIG]) {
+						input.widget[GET_CONFIG] = () => getConfig.call(this, input.widget.name);
 						const w = this.widgets.find((w) => w.name === input.widget.name);
 						if (w) {
 							hideWidget(this, w);
-						} else {
-							convertToWidget(this, input)
 						}
 					}
 				}
@@ -190,7 +229,7 @@ app.registerExtension({
 			const input = this.inputs[slot];
 			if (!input.widget || !input[ignoreDblClick]) {
 				// Not a widget input or already handled input
-				if (!(input.type in ComfyWidgets) && !(input.widget.config?.[0] instanceof Array)) {
+				if (!(input.type in ComfyWidgets) && !(input.widget[GET_CONFIG]?.()?.[0] instanceof Array)) {
 					return r; //also Not a ComfyWidgets input or combo (do nothing)
 				}
 			}
@@ -262,20 +301,55 @@ app.registerExtension({
 				}
 			}
 
-			onConnectionsChange(_, index, connected) {
-				if (connected) {
-					if (this.outputs[0].links?.length) {
-						if (!this.widgets?.length) {
-							this.#onFirstConnection();
-						}
-						if (!this.widgets?.length && this.outputs[0].widget) {
-							// On first load it often cant recreate the widget as the other node doesnt exist yet
-							// Manually recreate it from the output info
-							this.#createWidget(this.outputs[0].widget.config);
+			refreshComboInNode() { // re-reads the combo's option list from the config exposed by the output's widget
+				const widget = this.widgets?.[0];
+				if (widget?.type === "combo") {
+					widget.options.values = this.outputs[0].widget[GET_CONFIG]()[0]; // config[0] is used as the list of values
+
+					if (!widget.options.values.includes(widget.value)) {
+						widget.value = widget.options.values[0]; // current value is no longer offered: fall back to the first entry
+						widget.callback(widget.value);
+					}
+				}
+			}
+
+			onAfterGraphConfigured() {
+				if (this.outputs[0].links?.length && !this.widgets?.length) {
+					this.#onFirstConnection();
+
+					// Populate widget values from config data
+					if (this.widgets) {
+						for (let i = 0; i < this.widgets_values.length; i++) {
+							const w = this.widgets[i];
+							if (w) {
+								w.value = this.widgets_values[i];
+							}
 						}
 					}
-				} else if (!this.outputs[0].links?.length) {
-					this.#onLastDisconnect();
+
+					// Merge values if required
+					this.#mergeWidgetConfig();
+				}
+			}
+
+			onConnectionsChange(_, index, connected) {
+				if (app.configuringGraph) {
+					// Dont run while the graph is still setting up
+					return;
+				}
+
+				const links = this.outputs[0].links;
+				if (connected) {
+					if (links?.length && !this.widgets?.length) {
+						this.#onFirstConnection();
+					}
+				} else {
+					// We may have removed a link that caused the constraints to change
+					this.#mergeWidgetConfig();
+
+					if (!links?.length) {
+						this.#onLastDisconnect();
+					}
 				}
 			}
 
@@ -292,7 +366,7 @@ app.registerExtension({
 				}
 			}
 
-			#onFirstConnection() {
+			#onFirstConnection(recreating) {
 				// First connection can fire before the graph is ready on initial load so random things can be missing
 				const linkId = this.outputs[0].links[0];
 				const link = this.graph.links[linkId];
@@ -304,26 +378,24 @@ app.registerExtension({
 				const input = theirNode.inputs[link.target_slot];
 				if (!input) return;
 
-
-				var _widget;
+				let widget;
 				if (!input.widget) {
 					if (!(input.type in ComfyWidgets)) return;
-					_widget = { "name": input.name, "config": [input.type, {}] }//fake widget
+					widget = { name: input.name, [GET_CONFIG]: () => [input.type, {}] }; //fake widget
 				} else {
-					_widget = input.widget;
+					widget = input.widget;
 				}
 
-				const widget = _widget;
-				const { type, linkType } = getWidgetType(widget.config);
+				const { type } = getWidgetType(widget[GET_CONFIG]());
 				// Update our output to restrict to the widget type
-				this.outputs[0].type = linkType;
+				this.outputs[0].type = type;
 				this.outputs[0].name = type;
 				this.outputs[0].widget = widget;
 
-				this.#createWidget(widget.config, theirNode, widget.name);
+				this.#createWidget(widget[CONFIG] ?? widget[GET_CONFIG](), theirNode, widget.name, recreating);
 			}
 
-			#createWidget(inputData, node, widgetName) {
+			#createWidget(inputData, node, widgetName, recreating) {
 				let type = inputData[0];
 
 				if (type instanceof Array) {
@@ -334,7 +406,7 @@ app.registerExtension({
 				if (type in ComfyWidgets) {
 					widget = (ComfyWidgets[type](this, "value", inputData, app) || {}).widget;
 				} else {
-					widget = this.addWidget(type, "value", null, () => { }, {});
+					widget = this.addWidget(type, "value", null, () => {}, {});
 				}
 
 				if (node?.widgets && widget) {
@@ -358,60 +430,188 @@ app.registerExtension({
 					return r;
 				};
 
-				// Grow our node if required
-				const sz = this.computeSize();
-				if (this.size[0] < sz[0]) {
-					this.size[0] = sz[0];
-				}
-				if (this.size[1] < sz[1]) {
-					this.size[1] = sz[1];
-				}
-
-				requestAnimationFrame(() => {
-					if (this.onResize) {
-						this.onResize(this.size);
+				if (!recreating) {
+					// Grow our node if required
+					const sz = this.computeSize();
+					if (this.size[0] < sz[0]) {
+						this.size[0] = sz[0];
 					}
-				});
+					if (this.size[1] < sz[1]) {
+						this.size[1] = sz[1];
+					}
+
+					requestAnimationFrame(() => {
+						if (this.onResize) {
+							this.onResize(this.size);
+						}
+					});
+				}
 			}
 
-			#isValidConnection(input) {
+			#recreateWidget() {
+				const saved = this.widgets.map(({ value }) => value); // Remember current values by widget index
+				this.#removeWidgets();
+				this.#onFirstConnection(true); // true = recreating, which skips the node resize in #createWidget
+				for (const [idx, w] of (this.widgets ?? []).entries()) w.value = saved[idx]; // Restore values
+			}
+
+			#mergeWidgetConfig() {
+				// Re-merge the widget config from the inputs linked to our output (used on load and after a disconnect)
+				const output = this.outputs[0];
+				const links = output.links ?? []; // links can be nullish when nothing is connected
+				if (!output.widget) return; // Output is not bound to a widget, so there is nothing to merge
+				const hasConfig = !!output.widget[CONFIG];
+				if (hasConfig) {
+					delete output.widget[CONFIG];
+				}
+
+				if (links.length < 2 && hasConfig) {
+					// Back to at most one source: rebuild the widget so it copies that source's options
+					if (links.length) {
+						this.#recreateWidget();
+					}
+
+					return;
+				}
+
+				const config1 = output.widget[GET_CONFIG]();
+				const isNumber = config1[0] === "INT" || config1[0] === "FLOAT";
+				if (!isNumber) return; // Only INT/FLOAT options (min/max/step) are merged
+
+				for (const linkId of links) {
+					const link = app.graph.links[linkId];
+					if (!link) continue; // Can be null when removing a node
+
+					const theirNode = app.graph.getNodeById(link.target_id);
+					const theirInput = theirNode?.inputs?.[link.target_slot];
+					if (!theirInput?.widget) continue; // Target is gone or has no widget config to validate against
+					// Call is valid connection so it can merge the configs when validating
+					this.#isValidConnection(theirInput, hasConfig);
+				}
+			}
+
+			#isValidConnection(input, forceUpdate) { // forceUpdate: recreate the widget even if nothing was merged
 				// Only allow connections where the configs match
-				const config1 = this.outputs[0].widget.config;
-				const config2 = input.widget.config;
+				const output = this.outputs[0];
+				const config1 = output.widget[CONFIG] ?? output.widget[GET_CONFIG](); // Prefer the merged override
+				const config2 = input.widget[GET_CONFIG]();
 
 				if (config1[0] instanceof Array) {
-					// These checks shouldnt actually be necessary as the types should match
-					// but double checking doesn't hurt
-
 					// New input isnt a combo
-					if (!(config2[0] instanceof Array)) return false;
+					if (!(config2[0] instanceof Array)) {
+						console.log(`connection rejected: tried to connect combo to ${config2[0]}`);
+						return false;
+					}
 					// New imput combo has a different size
-					if (config1[0].length !== config2[0].length) return false;
+					if (config1[0].length !== config2[0].length) {
+						console.log(`connection rejected: combo lists dont match`);
+						return false;
+					}
 					// New input combo has different elements
-					if (config1[0].find((v, i) => config2[0][i] !== v)) return false;
+					if (config1[0].some((v, i) => config2[0][i] !== v)) { // some(): a falsy mismatching entry still counts
+						console.log(`connection rejected: combo lists dont match`);
+						return false;
+					}
 				} else if (config1[0] !== config2[0]) {
-					// Configs dont match
+					// Types dont match
+					console.log(`connection rejected: types dont match`, config1[0], config2[0]);
 					return false;
 				}
 
-				for (const k in config1[1]) {
-					if (k !== "default" && k !== 'forceInput') {
-						if (config1[1][k] !== config2[1][k]) {
-							return false;
+				const keys = new Set([...Object.keys(config1[1] ?? {}), ...Object.keys(config2[1] ?? {})]);
+
+				let customConfig;
+				const getCustomConfig = () => { // Lazily deep-copies our options the first time a merge is needed
+					if (!customConfig) {
+						if (typeof structuredClone === "undefined") {
+							customConfig = JSON.parse(JSON.stringify(config1[1] ?? {}));
+						} else {
+							customConfig = structuredClone(config1[1] ?? {});
 						}
 					}
+					return customConfig;
+				};
+
+				const isNumber = config1[0] === "INT" || config1[0] === "FLOAT";
+				for (const k of keys.values()) {
+					if (k !== "default" && k !== "forceInput" && k !== "defaultInput") {
+						let v1 = config1[1]?.[k];
+						let v2 = config2[1]?.[k];
+
+						if (v1 === v2 || (!v1 && !v2)) continue; // Equal, or both falsy: nothing to reconcile
+
+						if (isNumber) {
+							if (k === "min") {
+								const theirMax = config2[1]?.["max"];
+								if (theirMax != null && v1 > theirMax) {
+									console.log("connection rejected: min > max", v1, theirMax);
+									return false;
+								}
+								getCustomConfig()[k] = v1 == null ? v2 : v2 == null ? v1 : Math.max(v1, v2); // Larger min wins
+								continue;
+							} else if (k === "max") {
+								const theirMin = config2[1]?.["min"];
+								if (theirMin != null && v1 < theirMin) {
+									console.log("connection rejected: max < min", v1, theirMin);
+									return false;
+								}
+								getCustomConfig()[k] = v1 == null ? v2 : v2 == null ? v1 : Math.min(v1, v2); // Smaller max wins
+								continue;
+							} else if (k === "step") {
+								let step;
+								if (v1 == null) {
+									// No current step
+									step = v2;
+								} else if (v2 == null) {
+									// No new step
+									step = v1;
+								} else {
+									if (v1 < v2) {
+										// Ensure v1 is larger for the mod
+										const a = v2;
+										v2 = v1;
+										v1 = a;
+									}
+									if (v1 % v2) { // NOTE(review): % on fractional steps is subject to float rounding - confirm
+										console.log("connection rejected: steps not divisible", "current:", v1, "new:", v2);
+										return false;
+									}
+
+									step = v1; // Keep the larger of the two steps
+								}
+
+								getCustomConfig()[k] = step;
+								continue;
+							}
+						}
+
+						console.log(`connection rejected: config ${k} values dont match`, v1, v2);
+						return false;
+					}
+				}
+
+				if (customConfig || forceUpdate) { // Something was merged, or the caller asked for a rebuild
+					if (customConfig) {
+						output.widget[CONFIG] = [config1[0], customConfig];
+					}
+
+					this.#recreateWidget();
+
+					const widget = this.widgets[0];
+					// When deleting a node this can be null
+					if (widget) {
+						const min = widget.options.min;
+						const max = widget.options.max;
+						if (min != null && widget.value < min) widget.value = min;
+						if (max != null && widget.value > max) widget.value = max;
+						widget.callback(widget.value); // Invoke the widget callback with the (possibly clamped) value
+					}
 				}
 
 				return true;
 			}
 
-			#onLastDisconnect() {
-				// We cant remove + re-add the output here as if you drag a link over the same link
-				// it removes, then re-adds, causing it to break
-				this.outputs[0].type = "*";
-				this.outputs[0].name = "connect to widget input";
-				delete this.outputs[0].widget;
-
+			#removeWidgets() {
 				if (this.widgets) {
 					// Allow widgets to cleanup
 					for (const w of this.widgets) {
@@ -422,6 +622,16 @@ app.registerExtension({
 					this.widgets.length = 0;
 				}
 			}
+
+			#onLastDisconnect() {
+				// Reset the output in place instead of removing + re-adding it: dragging a link over the
+				// same link removes it and then re-adds it, which would break a recreated output
+				const output = this.outputs[0];
+				output.type = "*";
+				output.name = "connect to widget input";
+				delete output.widget;
+				this.#removeWidgets();
+			}
 		}
 
 		LiteGraph.registerNodeType(
diff --git a/web/scripts/app.js b/web/scripts/app.js
index 5efe08c00..7698d0f11 100644
--- a/web/scripts/app.js
+++ b/web/scripts/app.js
@@ -450,6 +450,47 @@ export class ComfyApp {
 					}
 				}
 
+				function calculateGrid(w, h, n) {
+					// Pick a columns x rows grid of square cells that holds n items inside a w x h area.
+					// The shorter side seeds the cell size; the count along the longer side follows from it
+					// and the other count is derived from n.
+					let columns, rows;
+					const fit = (side, widthFirst) => {
+						if (widthFirst) {
+							columns = Math.ceil(w / side);
+							rows = Math.ceil(n / columns);
+						} else {
+							rows = Math.ceil(h / side);
+							columns = Math.ceil(n / rows);
+						}
+					};
+
+					let side = w > h ? h : w;
+					fit(side, w > h);
+
+					// Grow the cell until the grid has room for all n items (ties between w and h go width-first here)
+					while (columns * rows < n) {
+						side++;
+						fit(side, w >= h);
+					}
+
+					// The final cell size is limited by whichever axis is tighter
+					return { cell_size: Math.min(w / columns, h / rows), columns, rows };
+				}
+
+				function is_all_same_aspect_ratio(imgs) {
+					// Returns true when every image has exactly the same naturalWidth/naturalHeight
+					// ratio as the first one. Assumes imgs.length >= 2.
+					const aspect = (img) => img.naturalWidth / img.naturalHeight;
+					const first = aspect(imgs[0]);
+
+					for (const img of Array.from(imgs).slice(1)) {
+						if (first != aspect(img)) return false;
+					}
+
+					return true;
+				}
+
 				if (this.imgs && this.imgs.length) {
 					const canvas = graph.list_of_graphcanvas[0];
 					const mouse = canvas.graph_mouse;
@@ -460,44 +501,60 @@ export class ComfyApp {
 						this.pointerDown = null;
 					}
 
-					let w = this.imgs[0].naturalWidth;
-					let h = this.imgs[0].naturalHeight;
 					let imageIndex = this.imageIndex;
 					const numImages = this.imgs.length;
 					if (numImages === 1 && !imageIndex) {
 						this.imageIndex = imageIndex = 0;
 					}
 
-					const shiftY = getImageTop(this);
+					const top = getImageTop(this);
+					var shiftY = top;
 
 					let dw = this.size[0];
 					let dh = this.size[1];
 					dh -= shiftY;
 
 					if (imageIndex == null) {
-						let best = 0;
-						let cellWidth;
-						let cellHeight;
-						let cols = 0;
-						let shiftX = 0;
-						for (let c = 1; c <= numImages; c++) {
-							const rows = Math.ceil(numImages / c);
-							const cW = dw / c;
-							const cH = dh / rows;
-							const scaleX = cW / w;
-							const scaleY = cH / h;
+						var cellWidth, cellHeight, shiftX, cell_padding, cols;
 
-							const scale = Math.min(scaleX, scaleY, 1);
-							const imageW = w * scale;
-							const imageH = h * scale;
-							const area = imageW * imageH * numImages;
+						const compact_mode = is_all_same_aspect_ratio(this.imgs);
+						if(!compact_mode) {
+							// use rectangle cell style and border line
+							cell_padding = 2;
+							const { cell_size, columns, rows } = calculateGrid(dw, dh, numImages);
+							cols = columns;
 
-							if (area > best) {
-								best = area;
-								cellWidth = imageW;
-								cellHeight = imageH;
-								cols = c;
-								shiftX = c * ((cW - imageW) / 2);
+							cellWidth = cell_size;
+							cellHeight = cell_size;
+							shiftX = (dw-cell_size*cols)/2;
+							shiftY = (dh-cell_size*rows)/2 + top;
+						}
+						else {
+							cell_padding = 0;
+							let best = 0;
+							let w = this.imgs[0].naturalWidth;
+							let h = this.imgs[0].naturalHeight;
+
+							// compact style
+							for (let c = 1; c <= numImages; c++) {
+								const rows = Math.ceil(numImages / c);
+								const cW = dw / c;
+								const cH = dh / rows;
+								const scaleX = cW / w;
+								const scaleY = cH / h;
+
+								const scale = Math.min(scaleX, scaleY, 1);
+								const imageW = w * scale;
+								const imageH = h * scale;
+								const area = imageW * imageH * numImages;
+
+								if (area > best) {
+									best = area;
+									cellWidth = imageW;
+									cellHeight = imageH;
+									cols = c;
+									shiftX = c * ((cW - imageW) / 2);
+								}
 							}
 						}
 
@@ -542,7 +599,14 @@ export class ComfyApp {
 							let imgWidth = ratio * img.width;
 							let imgX = col * cellWidth + shiftX + (cellWidth - imgWidth)/2;
 
-							ctx.drawImage(img, imgX, imgY, imgWidth, imgHeight);
+							ctx.drawImage(img, imgX+cell_padding, imgY+cell_padding, imgWidth-cell_padding*2, imgHeight-cell_padding*2);
+							if(!compact_mode) {
+								// rectangle cell and border line style
+								ctx.strokeStyle = "#8F8F8F";
+								ctx.lineWidth = 1;
+								ctx.strokeRect(x+cell_padding, y+cell_padding, cellWidth-cell_padding*2, cellHeight-cell_padding*2);
+							}
+
 							ctx.filter = "none";
 						}
 
@@ -552,6 +616,9 @@ export class ComfyApp {
 						}
 					} else {
 						// Draw individual
+						let w = this.imgs[imageIndex].naturalWidth;
+						let h = this.imgs[imageIndex].naturalHeight;
+
 						const scaleX = dw / w;
 						const scaleY = dh / h;
 						const scale = Math.min(scaleX, scaleY, 1);
@@ -594,14 +661,14 @@ export class ComfyApp {
 						};
 
 						if (numImages > 1) {
-							if (drawButton(x + w - 35, y + h - 35, 30, `${this.imageIndex + 1}/${numImages}`)) {
+							if (drawButton(dw - 40, dh + top - 40, 30, `${this.imageIndex + 1}/${numImages}`)) {
 								let i = this.imageIndex + 1 >= numImages ? 0 : this.imageIndex + 1;
 								if (!this.pointerDown || !this.pointerDown.index === i) {
 									this.pointerDown = { index: i, pos: [...mouse] };
 								}
 							}
 
-							if (drawButton(x + w - 35, y + 5, 30, `x`)) {
+							if (drawButton(dw - 40, top + 10, 30, `x`)) {
 								if (!this.pointerDown || !this.pointerDown.index === null) {
 									this.pointerDown = { index: null, pos: [...mouse] };
 								}
@@ -1114,6 +1181,40 @@ export class ComfyApp {
 		});
 	}
 
+	#addConfigureHandler() {
+		// Wrap LGraph.prototype.configure so nodes can tell (via app.configuringGraph) that the graph is still loading
+		const comfyApp = this;
+		const originalConfigure = LGraph.prototype.configure;
+		LGraph.prototype.configure = function (...args) {
+			comfyApp.configuringGraph = true;
+			try {
+				return originalConfigure.apply(this, args);
+			} finally {
+				comfyApp.configuringGraph = false; // Cleared even when configure throws
+			}
+		};
+	}
+
+	#addAfterConfigureHandler() {
+		const comfyApp = this;
+		const previousOnConfigure = comfyApp.graph.onConfigure;
+		comfyApp.graph.onConfigure = function (...args) {
+			// Notify every node before the original onConfigure; widget inputs use this to set up their config
+			for (const graphNode of comfyApp.graph._nodes) {
+				graphNode.onGraphConfigured?.();
+			}
+
+			const result = previousOnConfigure?.apply(this, args);
+
+			// Notify again afterwards; primitives use this to generate their widget from the input nodes' config
+			for (const graphNode of comfyApp.graph._nodes) {
+				graphNode.onAfterGraphConfigured?.();
+			}
+
+			return result;
+		};
+	}
+
 	/**
 	 * Loads all extensions from the API into the window in parallel
 	 */
@@ -1147,8 +1248,12 @@ export class ComfyApp {
 
 		this.#addProcessMouseHandler();
 		this.#addProcessKeyHandler();
+		this.#addConfigureHandler();
 
 		this.graph = new LGraph();
+
+		this.#addAfterConfigureHandler();
+
 		const canvas = (this.canvas = new LGraphCanvas(canvasEl, this.graph));
 		this.ctx = canvasEl.getContext("2d");
 
@@ -1268,7 +1373,8 @@ export class ComfyApp {
 					}
 
 					for (const o in nodeData["output"]) {
-						const output = nodeData["output"][o];
+						let output = nodeData["output"][o];
+						if(output instanceof Array) output = "COMBO";
 						const outputName = nodeData["output_name"][o] || output;
 						const outputShape = nodeData["output_is_list"][o] ? LiteGraph.GRID_SHAPE : LiteGraph.CIRCLE_SHAPE ;
 						this.addOutput(outputName, output, { shape: outputShape });
@@ -1285,6 +1391,7 @@ export class ComfyApp {
 				{
 					title: nodeData.display_name || nodeData.name,
 					comfyClass: nodeData.name,
+					nodeData
 				}
 			);
 			node.prototype.comfyClass = nodeData.name;
@@ -1322,6 +1429,7 @@ export class ComfyApp {
 		for (let n of graphData.nodes) {
 			// Patch T2IAdapterLoader to ControlNetLoader since they are the same node now
 			if (n.type == "T2IAdapterLoader") n.type = "ControlNetLoader";
+			if (n.type == "ConditioningAverage ") n.type = "ConditioningAverage"; //typo fix
 
 			// Find missing node types
 			if (!(n.type in LiteGraph.registered_node_types)) {
@@ -1669,13 +1777,21 @@ export class ComfyApp {
 	async refreshComboInNodes() {
 		const defs = await api.getNodeDefs();
 
+		for(const nodeId in LiteGraph.registered_node_types) {
+			const node = LiteGraph.registered_node_types[nodeId];
+			const nodeDef = defs[nodeId];
+			if(!nodeDef) continue;
+
+			node.nodeData = nodeDef;
+		}
+
 		for(let nodeNum in this.graph._nodes) {
 			const node = this.graph._nodes[nodeNum];
-
 			const def = defs[node.type];
 
-			// HOTFIX: The current patch is designed to prevent the rest of the code from breaking due to primitive nodes,
-			//         and additional work is needed to consider the primitive logic in the refresh logic.
+			// Allow primitive nodes to handle refresh
+			node.refreshComboInNode?.(defs);
+
 			if(!def)
 				continue;