Merge with upstream

2026-03-15 06:07:33 +08:00 · 2024-03-08 15:17:20 -08:00 · 2024-03-08 15:17:20 -08:00 · c0d9bc0129
commit c0d9bc0129
parent 148d57a772 55f37baae8
20 changed files with 326 additions and 315 deletions
--- a/.ci/update_windows/update.py
+++ b/.ci/update_windows/update.py
@ -104,7 +104,7 @@ if self_update and not files_equal(update_py_path, repo_update_py_path) and file
 if not os.path.exists(req_path) or not files_equal(repo_req_path, req_path):
    import subprocess
    try:
-        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-r', repo_req_path])
+        subprocess.check_call([sys.executable, '-s', '-m', 'pip', 'install', '-r', repo_req_path])
        shutil.copy(repo_req_path, req_path)
    except:
        pass
--- a/.github/workflows/windows_release_dependencies.yml
+++ b/.github/workflows/windows_release_dependencies.yml
@ -24,7 +24,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "6"
+        default: "8"
 #  push:
 #    branches:
 #      - master
--- a/.github/workflows/windows_release_nightly_pytorch.yml
+++ b/.github/workflows/windows_release_nightly_pytorch.yml
@ -19,7 +19,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "1"
+        default: "2"
 #  push:
 #    branches:
 #      - master
@ -49,7 +49,7 @@ jobs:
            echo 'import site' >> ./python3${{ inputs.python_minor }}._pth
            curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
            ./python.exe get-pip.py
-            python -m pip wheel torch torchvision --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
+            python -m pip wheel torch torchvision mpmath==1.3.0 --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
            ls ../temp_wheel_dir
            ./python.exe -s -m pip install --pre ../temp_wheel_dir/*
            sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
--- a/.github/workflows/windows_release_package.yml
+++ b/.github/workflows/windows_release_package.yml
@ -19,7 +19,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "6"
+        default: "8"
 #  push:
 #    branches:
 #      - master
--- a/comfy/cmd/cuda_malloc.py
+++ b/comfy/cmd/cuda_malloc.py
@ -1,6 +1,7 @@
 import os
 import importlib.util
 from ..cli_args import args
 import subprocess
 #Can't use pytorch to get the GPU names because the cuda malloc has to be set before the first import.
 def get_gpu_names():
@ -34,7 +35,15 @@ def get_gpu_names():
            return gpu_names
        return enum_display_devices()
    else:
-        return set()
+        gpu_names = set()
        try:
            out = subprocess.check_output(['nvidia-smi', '-L'])
            for l in out.split(b'\n'):
                if len(l) > 0:
                    gpu_names.add(l.decode('utf-8').split(' (UUID')[0])
        except IOError as error:
            pass
        return gpu_names
 blacklist = {"GeForce GTX TITAN X", "GeForce GTX 980", "GeForce GTX 970", "GeForce GTX 960", "GeForce GTX 950", "GeForce 945M",
                "GeForce 940M", "GeForce 930M", "GeForce 920M", "GeForce 910M", "GeForce GTX 750", "GeForce GTX 745", "Quadro K620",
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@ -10,6 +10,7 @@ from . import ops
 from .cldm import cldm
 from .t2i_adapter import adapter
 from .ldm.cascade import controlnet
 def broadcast_image_to(tensor, target_batch_size, batched_number):
@ -38,6 +39,8 @@ class ControlBase:
        self.timestep_percent_range = (0.0, 1.0)
        self.global_average_pooling = False
        self.timestep_range = None
        self.compression_ratio = 8
        self.upscale_algorithm = 'nearest-exact'
        if device is None:
            device = model_management.get_torch_device()
@ -78,6 +81,8 @@ class ControlBase:
        c.strength = self.strength
        c.timestep_percent_range = self.timestep_percent_range
        c.global_average_pooling = self.global_average_pooling
        c.compression_ratio = self.compression_ratio
        c.upscale_algorithm = self.upscale_algorithm
    def inference_memory_requirements(self, dtype):
        if self.previous_controlnet is not None:
@ -159,11 +164,11 @@ class ControlNet(ControlBase):
            dtype = self.manual_cast_dtype
        output_dtype = x_noisy.dtype
-        if self.cond_hint is None or x_noisy.shape[2] * 8 != self.cond_hint.shape[2] or x_noisy.shape[3] * 8 != self.cond_hint.shape[3]:
+        if self.cond_hint is None or x_noisy.shape[2] * self.compression_ratio != self.cond_hint.shape[2] or x_noisy.shape[3] * self.compression_ratio != self.cond_hint.shape[3]:
            if self.cond_hint is not None:
                del self.cond_hint
            self.cond_hint = None
-            self.cond_hint = utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * 8, x_noisy.shape[2] * 8, 'nearest-exact', "center").to(dtype).to(self.device)
+            self.cond_hint = utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * self.compression_ratio, x_noisy.shape[2] * self.compression_ratio, self.upscale_algorithm, "center").to(dtype).to(self.device)
        if x_noisy.shape[0] != self.cond_hint.shape[0]:
            self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)
@ -288,13 +293,13 @@ class ControlLora(ControlNet):
        for k in sd:
            weight = sd[k]
            try:
-                utils.set_attr(self.control_model, k, weight)
+                utils.set_attr_param(self.control_model, k, weight)
            except:
                pass
        for k in self.control_weights:
            if k not in {"lora_controlnet"}:
-                utils.set_attr(self.control_model, k, self.control_weights[k].to(dtype).to(model_management.get_torch_device()))
+                utils.set_attr_param(self.control_model, k, self.control_weights[k].to(dtype).to(model_management.get_torch_device()))
    def copy(self):
        c = ControlLora(self.control_weights, global_average_pooling=self.global_average_pooling)
@ -433,11 +438,13 @@ def load_controlnet(ckpt_path, model=None):
    return control
 class T2IAdapter(ControlBase):
-    def __init__(self, t2i_model, channels_in, device=None):
+    def __init__(self, t2i_model, channels_in, compression_ratio, upscale_algorithm, device=None):
        super().__init__(device)
        self.t2i_model = t2i_model
        self.channels_in = channels_in
        self.control_input = None
        self.compression_ratio = compression_ratio
        self.upscale_algorithm = upscale_algorithm
    def scale_image_to(self, width, height):
        unshuffle_amount = self.t2i_model.unshuffle_amount
@ -457,13 +464,13 @@ class T2IAdapter(ControlBase):
                else:
                    return None
-        if self.cond_hint is None or x_noisy.shape[2] * 8 != self.cond_hint.shape[2] or x_noisy.shape[3] * 8 != self.cond_hint.shape[3]:
+        if self.cond_hint is None or x_noisy.shape[2] * self.compression_ratio != self.cond_hint.shape[2] or x_noisy.shape[3] * self.compression_ratio != self.cond_hint.shape[3]:
            if self.cond_hint is not None:
                del self.cond_hint
            self.control_input = None
            self.cond_hint = None
-            width, height = self.scale_image_to(x_noisy.shape[3] * 8, x_noisy.shape[2] * 8)
+            width, height = self.scale_image_to(x_noisy.shape[3] * self.compression_ratio, x_noisy.shape[2] * self.compression_ratio)
-            self.cond_hint = utils.common_upscale(self.cond_hint_original, width, height, 'nearest-exact', "center").float().to(self.device)
+            self.cond_hint = utils.common_upscale(self.cond_hint_original, width, height, self.upscale_algorithm, "center").float().to(self.device)
            if self.channels_in == 1 and self.cond_hint.shape[1] > 1:
                self.cond_hint = torch.mean(self.cond_hint, 1, keepdim=True)
        if x_noisy.shape[0] != self.cond_hint.shape[0]:
@ -482,11 +489,14 @@ class T2IAdapter(ControlBase):
        return self.control_merge(control_input, mid, control_prev, x_noisy.dtype)
    def copy(self):
-        c = T2IAdapter(self.t2i_model, self.channels_in)
+        c = T2IAdapter(self.t2i_model, self.channels_in, self.compression_ratio, self.upscale_algorithm)
        self.copy_to(c)
        return c
 def load_t2i_adapter(t2i_data):
    compression_ratio = 8
    upscale_algorithm = 'nearest-exact'
    if 'adapter' in t2i_data:
        t2i_data = t2i_data['adapter']
    if 'adapter.body.0.resnets.0.block1.weight' in t2i_data: #diffusers format
@ -514,8 +524,17 @@ def load_t2i_adapter(t2i_data):
        if cin == 256 or cin == 768:
            xl = True
        model_ad = adapter.Adapter(cin=cin, channels=[channel, channel*2, channel*4, channel*4][:4], nums_rb=2, ksize=ksize, sk=True, use_conv=use_conv, xl=xl)
    elif "backbone.0.0.weight" in keys:
        model_ad = controlnet.ControlNet(c_in=t2i_data['backbone.0.0.weight'].shape[1], proj_blocks=[0, 4, 8, 12, 51, 55, 59, 63])
        compression_ratio = 32
        upscale_algorithm = 'bilinear'
    elif "backbone.10.blocks.0.weight" in keys:
        model_ad = controlnet.ControlNet(c_in=t2i_data['backbone.0.weight'].shape[1], bottleneck_mode="large", proj_blocks=[0, 4, 8, 12, 51, 55, 59, 63])
        compression_ratio = 1
        upscale_algorithm = 'nearest-exact'
    else:
        return None
    missing, unexpected = model_ad.load_state_dict(t2i_data)
    if len(missing) > 0:
        print("t2i missing", missing)
@ -523,4 +542,4 @@ def load_t2i_adapter(t2i_data):
    if len(unexpected) > 0:
        print("t2i unexpected", unexpected)
-    return T2IAdapter(model_ad, model_ad.input_channels)
+    return T2IAdapter(model_ad, model_ad.input_channels, compression_ratio, upscale_algorithm)
--- a/comfy/ldm/cascade/controlnet.py
+++ b/comfy/ldm/cascade/controlnet.py
@ -0,0 +1,93 @@
 """
    This file is part of ComfyUI.
    Copyright (C) 2024 Stability AI
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """
 import torch
 import torchvision
 from torch import nn
 from .common import LayerNorm2d_op
 class CNetResBlock(nn.Module):
    """Residual block used by the 'large' ControlNet backbone.

    The block stacks two (norm -> GELU -> 3x3 convolution) stages and adds
    the stack's output back onto its input. Every layer maps ``c`` channels
    to ``c`` channels and the convolutions use ``padding=1``.

    Args:
        c: number of input and output channels.
        dtype: forwarded to every parameterised layer of the block.
        device: forwarded to every parameterised layer of the block.
        operations: namespace that provides ``Conv2d`` and is handed to
            ``LayerNorm2d_op``; it must not be ``None`` because
            ``operations.Conv2d`` is accessed unconditionally.
    """
    def __init__(self, c, dtype=None, device=None, operations=None):
        super().__init__()
        self.blocks = nn.Sequential(
            LayerNorm2d_op(operations)(c, dtype=dtype, device=device),
            nn.GELU(),
            # The convolutions take dtype/device as well, so that the whole
            # block is created with the same placement as its norm layers.
            operations.Conv2d(c, c, kernel_size=3, padding=1, dtype=dtype, device=device),
            LayerNorm2d_op(operations)(c, dtype=dtype, device=device),
            nn.GELU(),
            operations.Conv2d(c, c, kernel_size=3, padding=1, dtype=dtype, device=device),
        )
    def forward(self, x):
        """Return ``x`` plus the output of the stacked layers applied to ``x``."""
        return x + self.blocks(x)
 class ControlNet(nn.Module):
    """Backbone plus one 1x1-convolution projection head per entry of ``proj_blocks``.

    Args:
        c_in: number of channels of the input image.
        c_proj: number of output channels of every projection head.
        proj_blocks: block indices that receive a projection. ``None`` (the
            default) is treated as "no projections".
        bottleneck_mode: ``'effnet'`` (used when ``None``), ``'simple'`` or
            ``'large'``; selects how the backbone is built.
        dtype: forwarded to the layers created through ``operations``.
        device: forwarded to the layers created through ``operations``.
        operations: namespace that provides ``Conv2d``.

    Raises:
        ValueError: if ``bottleneck_mode`` is not one of the known modes.
    """
    def __init__(self, c_in=3, c_proj=2048, proj_blocks=None, bottleneck_mode=None, dtype=None, device=None, operations=nn):
        super().__init__()
        if bottleneck_mode is None:
            bottleneck_mode = 'effnet'
        if proj_blocks is None:
            # The signature advertises None as the default; without this
            # guard len(proj_blocks) below raises TypeError.
            proj_blocks = []
        self.proj_blocks = proj_blocks
        if bottleneck_mode == 'effnet':
            embd_channels = 1280
            self.backbone = torchvision.models.efficientnet_v2_s().features.eval()
            if c_in != 3:
                # Swap the first convolution for one that accepts c_in
                # channels, reusing the weights of the layer it replaces.
                in_weights = self.backbone[0][0].weight.data
                self.backbone[0][0] = operations.Conv2d(c_in, 24, kernel_size=3, stride=2, bias=False, dtype=dtype, device=device)
                if c_in > 3:
                    # nn.init.constant_(self.backbone[0][0].weight, 0)
                    # Only the first three input channels get the old
                    # weights; the others keep the new layer's values.
                    self.backbone[0][0].weight.data[:, :3] = in_weights[:, :3].clone()
                else:
                    self.backbone[0][0].weight.data = in_weights[:, :c_in].clone()
        elif bottleneck_mode == 'simple':
            embd_channels = c_in
            self.backbone = nn.Sequential(
                operations.Conv2d(embd_channels, embd_channels * 4, kernel_size=3, padding=1, dtype=dtype, device=device),
                nn.LeakyReLU(0.2, inplace=True),
                operations.Conv2d(embd_channels * 4, embd_channels, kernel_size=3, padding=1, dtype=dtype, device=device),
            )
        elif bottleneck_mode == 'large':
            self.backbone = nn.Sequential(
                operations.Conv2d(c_in, 4096 * 4, kernel_size=1, dtype=dtype, device=device),
                nn.LeakyReLU(0.2, inplace=True),
                operations.Conv2d(4096 * 4, 1024, kernel_size=1, dtype=dtype, device=device),
                *[CNetResBlock(1024, dtype=dtype, device=device, operations=operations) for _ in range(8)],
                operations.Conv2d(1024, 1280, kernel_size=1, dtype=dtype, device=device),
            )
            embd_channels = 1280
        else:
            raise ValueError(f'Unknown bottleneck mode: {bottleneck_mode}')
        self.projections = nn.ModuleList()
        for _ in range(len(proj_blocks)):
            self.projections.append(nn.Sequential(
                operations.Conv2d(embd_channels, embd_channels, kernel_size=1, bias=False, dtype=dtype, device=device),
                nn.LeakyReLU(0.2, inplace=True),
                operations.Conv2d(embd_channels, c_proj, kernel_size=1, bias=False, dtype=dtype, device=device),
            ))
            # nn.init.constant_(self.projections[-1][-1].weight, 0)  # zero output projection
        # Plain attributes set for code that inspects the model from outside.
        self.xl = False
        self.input_channels = c_in
        self.unshuffle_amount = 8
    def forward(self, x):
        """Run the backbone once and project its output for each configured block.

        Returns:
            A list of length ``max(proj_blocks) + 1`` (empty when there are
            no projection blocks). Position ``idx`` holds the projection for
            block ``idx``; positions not listed in ``proj_blocks`` are ``None``.
        """
        x = self.backbone(x)
        # default=-1 gives an empty result list instead of a ValueError when
        # no projection blocks are configured.
        slot_count = max(self.proj_blocks, default=-1) + 1
        proj_outputs = [None] * slot_count
        for i, idx in enumerate(self.proj_blocks):
            proj_outputs[idx] = self.projections[i](x)
        return proj_outputs
--- a/comfy/ldm/cascade/stage_c.py
+++ b/comfy/ldm/cascade/stage_c.py
@ -194,10 +194,10 @@ class StageC(nn.Module):
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  ResBlock)):
                        if cnet is not None:
-                            next_cnet = cnet()
+                            next_cnet = cnet.pop()
                            if next_cnet is not None:
                                x = x + nn.functional.interpolate(next_cnet, size=x.shape[-2:], mode='bilinear',
-                                                                  align_corners=True)
+                                                                  align_corners=True).to(x.dtype)
                        x = block(x)
                    elif isinstance(block, AttnBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
@ -228,10 +228,10 @@ class StageC(nn.Module):
                            x = torch.nn.functional.interpolate(x, skip.shape[-2:], mode='bilinear',
                                                                align_corners=True)
                        if cnet is not None:
-                            next_cnet = cnet()
+                            next_cnet = cnet.pop()
                            if next_cnet is not None:
                                x = x + nn.functional.interpolate(next_cnet, size=x.shape[-2:], mode='bilinear',
-                                                                  align_corners=True)
+                                                                  align_corners=True).to(x.dtype)
                        x = block(x, skip)
                    elif isinstance(block, AttnBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
@ -248,7 +248,7 @@ class StageC(nn.Module):
            x = upscaler(x)
        return x
-    def forward(self, x, r, clip_text, clip_text_pooled, clip_img, cnet=None, **kwargs):
+    def forward(self, x, r, clip_text, clip_text_pooled, clip_img, control=None, **kwargs):
        # Process the conditioning embeddings
        r_embed = self.gen_r_embedding(r).to(dtype=x.dtype)
        for c in self.t_conds:
@ -256,10 +256,13 @@ class StageC(nn.Module):
            r_embed = torch.cat([r_embed, self.gen_r_embedding(t_cond).to(dtype=x.dtype)], dim=1)
        clip = self.gen_c_embeddings(clip_text, clip_text_pooled, clip_img)
        if control is not None:
            cnet = control.get("input")
        else:
            cnet = None
        # Model Blocks
        x = self.embedding(x)
        if cnet is not None:
            cnet = ControlNetDeliverer(cnet)
        level_outputs = self._down_encode(x, r_embed, clip, cnet)
        x = self._up_decode(level_outputs, r_embed, clip, cnet)
        return self.clf(x)
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -166,6 +166,10 @@ class BaseModel(torch.nn.Module):
        if cross_attn_cnet is not None:
            out['crossattn_controlnet'] = conds.CONDCrossAttn(cross_attn_cnet)
        c_concat = kwargs.get("noise_concat", None)
        if c_concat is not None:
            out['c_concat'] = comfy.conds.CONDNoiseShape(data)
        return out
    def load_model_weights(self, sd, unet_prefix=""):
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@ -763,7 +763,7 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
    #FP16 is confirmed working on a 1080 (GP104) but it's a bit slower than FP32 so it should only be enabled
    #when the model doesn't actually fit on the card
    #TODO: actually test if GP106 and others have the same type of behavior
-    nvidia_10_series = ["1080", "1070", "titan x", "p3000", "p3200", "p4000", "p4200", "p5000", "p5200", "p6000", "1060", "1050"]
+    nvidia_10_series = ["1080", "1070", "titan x", "p3000", "p3200", "p4000", "p4200", "p5000", "p5200", "p6000", "1060", "1050", "p40", "p100", "p6", "p4"]
    for x in nvidia_10_series:
        if x in props.name.lower():
            fp16_works = True
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@ -67,6 +67,9 @@ class ModelPatcher:
    def set_model_unet_function_wrapper(self, unet_wrapper_function):
        """Store ``unet_wrapper_function`` in ``model_options`` under ``"model_function_wrapper"``."""
        options = self.model_options
        options["model_function_wrapper"] = unet_wrapper_function
    def set_model_denoise_mask_function(self, denoise_mask_function):
        """Store ``denoise_mask_function`` in ``model_options`` under ``"denoise_mask_function"``."""
        options = self.model_options
        options["denoise_mask_function"] = denoise_mask_function
    def set_model_patch(self, patch, name):
        to = self.model_options["transformer_options"]
        if "patches" not in to:
@ -176,10 +179,9 @@ class ModelPatcher:
    def patch_model(self, device_to=None, patch_weights=True):
        for k in self.object_patches:
-            old = getattr(self.model, k)
+            old = utils.set_attr(self.model, k, self.object_patches[k])
            if k not in self.object_patches_backup:
                self.object_patches_backup[k] = old
            setattr(self.model, k, self.object_patches[k])
        if patch_weights:
            model_sd = self.model_state_dict()
@ -203,7 +205,7 @@ class ModelPatcher:
                if inplace_update:
                    utils.copy_to_param(self.model, key, out_weight)
                else:
-                    utils.set_attr(self.model, key, out_weight)
+                    utils.set_attr_param(self.model, key, out_weight)
                del temp_weight
            if device_to is not None:
@ -342,7 +344,7 @@ class ModelPatcher:
                utils.copy_to_param(self.model, k, self.backup[k])
        else:
            for k in keys:
-                utils.set_attr(self.model, k, self.backup[k])
+                utils.set_attr_param(self.model, k, self.backup[k])
        self.backup = {}
@ -352,6 +354,6 @@ class ModelPatcher:
        keys = list(self.object_patches_backup.keys())
        for k in keys:
-            setattr(self.model, k, self.object_patches_backup[k])
+            utils.set_attr(self.model, k, self.object_patches_backup[k])
        self.object_patches_backup = {}
--- a/comfy/model_sampling.py
+++ b/comfy/model_sampling.py
@ -11,6 +11,14 @@ class EPS:
        sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1))
        return model_input - model_output * sigma
    def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
        """Scale ``noise`` for ``sigma`` and add the latent image to the result.

        Args:
            sigma: noise level used as the multiplier.
            noise: noise to scale. The in-place addition below is applied to
                the product, not to this argument.
            latent_image: value added to the scaled noise, or ``None`` to get
                the scaled noise alone.
            max_denoise: when true, multiply by ``sqrt(1 + sigma ** 2)``
                instead of by ``sigma``.

        Returns:
            The scaled noise, with ``latent_image`` added when one is given.
        """
        if max_denoise:
            scaled = noise * torch.sqrt(1.0 + sigma ** 2.0)
        else:
            scaled = noise * sigma
        # A sampler may have no latent image to add; `tensor += None` would
        # raise, so the addition is skipped in that case.
        if latent_image is not None:
            scaled += latent_image
        return scaled
 class V_PREDICTION(EPS):
    def calculate_denoised(self, sigma, model_output, model_input):
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@ -275,15 +275,16 @@ class CFGNoisePredictor(torch.nn.Module):
        return self.apply_model(*args, **kwargs)
 class KSamplerX0Inpaint(torch.nn.Module):
-    def __init__(self, model):
+    def __init__(self, model, sigmas):
        super().__init__()
        self.inner_model = model
        self.sigmas = sigmas
    def forward(self, x, sigma, uncond, cond, cond_scale, denoise_mask, model_options={}, seed=None):
        if denoise_mask is not None:
            if "denoise_mask_function" in model_options:
-                denoise_mask = model_options["denoise_mask_function"](sigma, denoise_mask)
+                denoise_mask = model_options["denoise_mask_function"](sigma, denoise_mask, extra_options={"model": self.inner_model, "sigmas": self.sigmas})
            latent_mask = 1. - denoise_mask
-            x = x * denoise_mask + (self.latent_image + self.noise * sigma.reshape([sigma.shape[0]] + [1] * (len(self.noise.shape) - 1))) * latent_mask
+            x = x * denoise_mask + self.inner_model.inner_model.model_sampling.noise_scaling(sigma.reshape([sigma.shape[0]] + [1] * (len(self.noise.shape) - 1)), self.noise, self.latent_image) * latent_mask
        out = self.inner_model(x, sigma, cond=cond, uncond=uncond, cond_scale=cond_scale, model_options=model_options, seed=seed)
        if denoise_mask is not None:
            out = out * denoise_mask + self.latent_image * latent_mask
@ -531,7 +532,7 @@ class KSAMPLER(Sampler):
    def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False):
        extra_args["denoise_mask"] = denoise_mask
-        model_k = KSamplerX0Inpaint(model_wrap)
+        model_k = KSamplerX0Inpaint(model_wrap, sigmas)
        model_k.latent_image = latent_image
        if self.inpaint_options.get("random", False): #TODO: Should this be the default?
            generator = torch.manual_seed(extra_args.get("seed", 41) + 1)
@ -539,19 +540,13 @@ class KSAMPLER(Sampler):
        else:
            model_k.noise = noise
-        if self.max_denoise(model_wrap, sigmas):
+        noise = model_wrap.inner_model.model_sampling.noise_scaling(sigmas[0], noise, latent_image, self.max_denoise(model_wrap, sigmas))
            noise = noise * torch.sqrt(1.0 + sigmas[0] ** 2.0)
        else:
            noise = noise * sigmas[0]
        k_callback = None
        total_steps = len(sigmas) - 1
        if callback is not None:
            k_callback = lambda x: callback(x["i"], x["denoised"], x["x"], total_steps)
        if latent_image is not None:
            noise += latent_image
        samples = self.sampler_function(model_k, noise, sigmas, extra_args=extra_args, callback=k_callback, disable=disable_pbar, **self.extra_options)
        return samples
--- a/comfy/utils.py
+++ b/comfy/utils.py
@ -296,8 +296,11 @@ def set_attr(obj, attr, value):
    for name in attrs[:-1]:
        obj = getattr(obj, name)
    prev = getattr(obj, attrs[-1])
-    setattr(obj, attrs[-1], torch.nn.Parameter(value, requires_grad=False))
+    setattr(obj, attrs[-1], value)
-    del prev
+    return prev
 def set_attr_param(obj, attr, value):
    """Assign ``value`` to ``attr`` on ``obj`` as a non-trainable ``torch.nn.Parameter``.

    The assignment itself is delegated to ``set_attr``, and its return value
    is passed back to the caller unchanged.
    """
    frozen = torch.nn.Parameter(value, requires_grad=False)
    return set_attr(obj, attr, frozen)
 def copy_to_param(obj, attr, value):
    # inplace update tensor instead of replacing it
--- a/comfy_extras/nodes/nodes_canny.py
+++ b/comfy_extras/nodes/nodes_canny.py
@ -5,275 +5,7 @@ import torch
 import torch.nn.functional as F
 import comfy.model_management
-def get_canny_nms_kernel(device=None, dtype=None):
+from kornia.filters import canny
    """Utility function that returns 3x3 kernels for the Canny Non-maximal suppression."""
    return torch.tensor(
        [
            [[[0.0, 0.0, 0.0], [0.0, 1.0, -1.0], [0.0, 0.0, 0.0]]],
            [[[0.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, -1.0]]],
            [[[0.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, -1.0, 0.0]]],
            [[[0.0, 0.0, 0.0], [0.0, 1.0, 0.0], [-1.0, 0.0, 0.0]]],
            [[[0.0, 0.0, 0.0], [-1.0, 1.0, 0.0], [0.0, 0.0, 0.0]]],
            [[[-1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]]],
            [[[0.0, -1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]]],
            [[[0.0, 0.0, -1.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]]],
        ],
        device=device,
        dtype=dtype,
    )
 def get_hysteresis_kernel(device=None, dtype=None):
    """Utility function that returns the 3x3 kernels for the Canny hysteresis."""
    return torch.tensor(
        [
            [[[0.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0]]],
            [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]]],
            [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 1.0, 0.0]]],
            [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [1.0, 0.0, 0.0]]],
            [[[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 0.0]]],
            [[[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]],
            [[[0.0, 1.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]],
            [[[0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]],
        ],
        device=device,
        dtype=dtype,
    )
 def gaussian_blur_2d(img, kernel_size, sigma):
    ksize_half = (kernel_size - 1) * 0.5
    x = torch.linspace(-ksize_half, ksize_half, steps=kernel_size)
    pdf = torch.exp(-0.5 * (x / sigma).pow(2))
    x_kernel = pdf / pdf.sum()
    x_kernel = x_kernel.to(device=img.device, dtype=img.dtype)
    kernel2d = torch.mm(x_kernel[:, None], x_kernel[None, :])
    kernel2d = kernel2d.expand(img.shape[-3], 1, kernel2d.shape[0], kernel2d.shape[1])
    padding = [kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2]
    img = torch.nn.functional.pad(img, padding, mode="reflect")
    img = torch.nn.functional.conv2d(img, kernel2d, groups=img.shape[-3])
    return img
 def get_sobel_kernel2d(device=None, dtype=None):
    kernel_x = torch.tensor([[-1.0, 0.0, 1.0], [-2.0, 0.0, 2.0], [-1.0, 0.0, 1.0]], device=device, dtype=dtype)
    kernel_y = kernel_x.transpose(0, 1)
    return torch.stack([kernel_x, kernel_y])
 def spatial_gradient(input, normalized: bool = True):
    r"""Compute the first order image derivative in both x and y using a Sobel operator.
    .. image:: _static/img/spatial_gradient.png
    Args:
        input: input image tensor with shape :math:`(B, C, H, W)`.
        mode: derivatives modality, can be: `sobel` or `diff`.
        order: the order of the derivatives.
        normalized: whether the output is normalized.
    Return:
        the derivatives of the input feature map. with shape :math:`(B, C, 2, H, W)`.
    .. note::
       See a working example `here <https://kornia-tutorials.readthedocs.io/en/latest/
       filtering_edges.html>`__.
    Examples:
        >>> input = torch.rand(1, 3, 4, 4)
        >>> output = spatial_gradient(input)  # 1x3x2x4x4
        >>> output.shape
        torch.Size([1, 3, 2, 4, 4])
    """
    # KORNIA_CHECK_IS_TENSOR(input)
    # KORNIA_CHECK_SHAPE(input, ['B', 'C', 'H', 'W'])
    # allocate kernel
    kernel = get_sobel_kernel2d(device=input.device, dtype=input.dtype)
    if normalized:
        kernel = normalize_kernel2d(kernel)
    # prepare kernel
    b, c, h, w = input.shape
    tmp_kernel = kernel[:, None, ...]
    # Pad with "replicate for spatial dims, but with zeros for channel
    spatial_pad = [kernel.size(1) // 2, kernel.size(1) // 2, kernel.size(2) // 2, kernel.size(2) // 2]
    out_channels: int = 2
    padded_inp = torch.nn.functional.pad(input.reshape(b * c, 1, h, w), spatial_pad, 'replicate')
    out = F.conv2d(padded_inp, tmp_kernel, groups=1, padding=0, stride=1)
    return out.reshape(b, c, out_channels, h, w)
 def rgb_to_grayscale(image, rgb_weights = None):
    r"""Convert a RGB image to grayscale version of image.
    .. image:: _static/img/rgb_to_grayscale.png
    The image data is assumed to be in the range of (0, 1).
    Args:
        image: RGB image to be converted to grayscale with shape :math:`(*,3,H,W)`.
        rgb_weights: Weights that will be applied on each channel (RGB).
            The sum of the weights should add up to one.
    Returns:
        grayscale version of the image with shape :math:`(*,1,H,W)`.
    .. note::
       See a working example `here <https://kornia-tutorials.readthedocs.io/en/latest/
       color_conversions.html>`__.
    Example:
        >>> input = torch.rand(2, 3, 4, 5)
        >>> gray = rgb_to_grayscale(input) # 2x1x4x5
    """
    if len(image.shape) < 3 or image.shape[-3] != 3:
        raise ValueError(f"Input size must have a shape of (*, 3, H, W). Got {image.shape}")
    if rgb_weights is None:
        # 8 bit images
        if image.dtype == torch.uint8:
            rgb_weights = torch.tensor([76, 150, 29], device=image.device, dtype=torch.uint8)
        # floating point images
        elif image.dtype in (torch.float16, torch.float32, torch.float64):
            rgb_weights = torch.tensor([0.299, 0.587, 0.114], device=image.device, dtype=image.dtype)
        else:
            raise TypeError(f"Unknown data type: {image.dtype}")
    else:
        # is tensor that we make sure is in the same device/dtype
        rgb_weights = rgb_weights.to(image)
    # unpack the color image channels with RGB order
    r: Tensor = image[..., 0:1, :, :]
    g: Tensor = image[..., 1:2, :, :]
    b: Tensor = image[..., 2:3, :, :]
    w_r, w_g, w_b = rgb_weights.unbind()
    return w_r * r + w_g * g + w_b * b
 def canny(
    input,
    low_threshold = 0.1,
    high_threshold = 0.2,
    kernel_size  = 5,
    sigma = 1,
    hysteresis = True,
    eps = 1e-6,
 ):
    r"""Find edges of the input image and filters them using the Canny algorithm.
    .. image:: _static/img/canny.png
    Args:
        input: input image tensor with shape :math:`(B,C,H,W)`.
        low_threshold: lower threshold for the hysteresis procedure.
        high_threshold: upper threshold for the hysteresis procedure.
        kernel_size: the size of the kernel for the gaussian blur.
        sigma: the standard deviation of the kernel for the gaussian blur.
        hysteresis: if True, applies the hysteresis edge tracking.
            Otherwise, the edges are divided between weak (0.5) and strong (1) edges.
        eps: regularization number to avoid NaN during backprop.
    Returns:
        - the canny edge magnitudes map, shape of :math:`(B,1,H,W)`.
        - the canny edge detection filtered by thresholds and hysteresis, shape of :math:`(B,1,H,W)`.
    .. note::
       See a working example `here <https://kornia-tutorials.readthedocs.io/en/latest/
       canny.html>`__.
    Example:
        >>> input = torch.rand(5, 3, 4, 4)
        >>> magnitude, edges = canny(input)  # 5x3x4x4
        >>> magnitude.shape
        torch.Size([5, 1, 4, 4])
        >>> edges.shape
        torch.Size([5, 1, 4, 4])
    """
    # KORNIA_CHECK_IS_TENSOR(input)
    # KORNIA_CHECK_SHAPE(input, ['B', 'C', 'H', 'W'])
    # KORNIA_CHECK(
    #     low_threshold <= high_threshold,
    #     "Invalid input thresholds. low_threshold should be smaller than the high_threshold. Got: "
    #     f"{low_threshold}>{high_threshold}",
    # )
    # KORNIA_CHECK(0 < low_threshold < 1, f'Invalid low threshold. Should be in range (0, 1). Got: {low_threshold}')
    # KORNIA_CHECK(0 < high_threshold < 1, f'Invalid high threshold. Should be in range (0, 1). Got: {high_threshold}')
    device = input.device
    dtype = input.dtype
    # To Grayscale
    if input.shape[1] == 3:
        input = rgb_to_grayscale(input)
    # Gaussian filter
    blurred: Tensor = gaussian_blur_2d(input, kernel_size, sigma)
    # Compute the gradients
    gradients: Tensor = spatial_gradient(blurred, normalized=False)
    # Unpack the edges
    gx: Tensor = gradients[:, :, 0]
    gy: Tensor = gradients[:, :, 1]
    # Compute gradient magnitude and angle
    magnitude: Tensor = torch.sqrt(gx * gx + gy * gy + eps)
    angle: Tensor = torch.atan2(gy, gx)
    # Radians to Degrees
    angle = 180.0 * angle / math.pi
    # Round angle to the nearest 45 degree
    angle = torch.round(angle / 45) * 45
    # Non-maximal suppression
    nms_kernels: Tensor = get_canny_nms_kernel(device, dtype)
    nms_magnitude: Tensor = F.conv2d(magnitude, nms_kernels, padding=nms_kernels.shape[-1] // 2)
    # Get the indices for both directions
    positive_idx: Tensor = (angle / 45) % 8
    positive_idx = positive_idx.long()
    negative_idx: Tensor = ((angle / 45) + 4) % 8
    negative_idx = negative_idx.long()
    # Apply the non-maximum suppression to the different directions
    channel_select_filtered_positive: Tensor = torch.gather(nms_magnitude, 1, positive_idx)
    channel_select_filtered_negative: Tensor = torch.gather(nms_magnitude, 1, negative_idx)
    channel_select_filtered: Tensor = torch.stack(
        [channel_select_filtered_positive, channel_select_filtered_negative], 1
    )
    is_max: Tensor = channel_select_filtered.min(dim=1)[0] > 0.0
    magnitude = magnitude * is_max
    # Threshold
    edges: Tensor = F.threshold(magnitude, low_threshold, 0.0)
    low: Tensor = magnitude > low_threshold
    high: Tensor = magnitude > high_threshold
    edges = low * 0.5 + high * 0.5
    edges = edges.to(dtype)
    # Hysteresis
    if hysteresis:
        edges_old: Tensor = -torch.ones(edges.shape, device=edges.device, dtype=dtype)
        hysteresis_kernels: Tensor = get_hysteresis_kernel(device, dtype)
        while ((edges_old - edges).abs() != 0).any():
            weak: Tensor = (edges == 0.5).float()
            strong: Tensor = (edges == 1).float()
            hysteresis_magnitude: Tensor = F.conv2d(
                edges, hysteresis_kernels, padding=hysteresis_kernels.shape[-1] // 2
            )
            hysteresis_magnitude = (hysteresis_magnitude == 1).any(1, keepdim=True).to(dtype)
            hysteresis_magnitude = hysteresis_magnitude * weak + strong
            edges_old = edges.clone()
            edges = hysteresis_magnitude + (hysteresis_magnitude == 0) * weak * 0.5
        edges = hysteresis_magnitude
    return magnitude, edges
 class Canny:
--- a/comfy_extras/nodes/nodes_mask.py
+++ b/comfy_extras/nodes/nodes_mask.py
@ -342,6 +342,24 @@ class GrowMask:
            out.append(output)
        return (torch.stack(out, dim=0),)
 class ThresholdMask:
    """Node that binarizes a mask by comparing it against a scalar cutoff."""
    CATEGORY = "mask"
    RETURN_TYPES = ("MASK",)
    FUNCTION = "image_to_mask"
    @classmethod
    def INPUT_TYPES(s):
        """Declare the required inputs: a ``mask`` and the float cutoff ``value``."""
        cutoff_options = {"default": 0.5, "min": 0.0, "max": 1.0, "step": 0.01}
        required_inputs = {
            "mask": ("MASK",),
            "value": ("FLOAT", cutoff_options),
        }
        return {"required": required_inputs}
    def image_to_mask(self, mask, value):
        """Return a 1-tuple with a float mask: 1.0 where ``mask > value``, else 0.0.
        The comparison is strict, so elements equal to ``value`` map to 0.0.
        """
        is_above = mask > value
        return (is_above.float(),)
 NODE_CLASS_MAPPINGS = {
@ -356,6 +374,7 @@ NODE_CLASS_MAPPINGS = {
    "MaskComposite": MaskComposite,
    "FeatherMask": FeatherMask,
    "GrowMask": GrowMask,
    "ThresholdMask": ThresholdMask,
 }
 NODE_DISPLAY_NAME_MAPPINGS = {
--- a/comfy_extras/nodes/nodes_stable_cascade.py
+++ b/comfy_extras/nodes/nodes_stable_cascade.py
@ -37,7 +37,7 @@ class StableCascade_EmptyLatentImage:
    RETURN_NAMES = ("stage_c", "stage_b")
    FUNCTION = "generate"
-    CATEGORY = "_for_testing/stable_cascade"
+    CATEGORY = "latent/stable_cascade"
    def generate(self, width, height, compression, batch_size=1):
        c_latent = torch.zeros([batch_size, 16, height // compression, width // compression])
@ -63,7 +63,7 @@ class StableCascade_StageC_VAEEncode:
    RETURN_NAMES = ("stage_c", "stage_b")
    FUNCTION = "generate"
-    CATEGORY = "_for_testing/stable_cascade"
+    CATEGORY = "latent/stable_cascade"
    def generate(self, image, vae, compression):
        width = image.shape[-2]
@ -91,7 +91,7 @@ class StableCascade_StageB_Conditioning:
    FUNCTION = "set_prior"
-    CATEGORY = "_for_testing/stable_cascade"
+    CATEGORY = "conditioning/stable_cascade"
    def set_prior(self, conditioning, stage_c):
        c = []
@ -102,8 +102,39 @@ class StableCascade_StageB_Conditioning:
            c.append(n)
        return (c, )
 class StableCascade_SuperResolutionControlnet:
    """Node producing a VAE-encoded controlnet input plus empty stage C / stage B latents."""
    RETURN_TYPES = ("IMAGE", "LATENT", "LATENT")
    RETURN_NAMES = ("controlnet_input", "stage_c", "stage_b")
    FUNCTION = "generate"
    CATEGORY = "_for_testing/stable_cascade"
    def __init__(self, device="cpu"):
        # Stored only; nothing in this class reads it back.
        self.device = device
    @classmethod
    def INPUT_TYPES(s):
        """Declare the required inputs: an ``image`` and a ``vae``."""
        required_inputs = {
            "image": ("IMAGE",),
            "vae": ("VAE", ),
        }
        return {"required": required_inputs}
    def generate(self, image, vae):
        """Encode ``image`` with ``vae`` and allocate zero latents sized from the image.
        Returns a 3-tuple ``(controlnet_input, stage_c, stage_b)`` where the two
        latents are dicts holding a zero tensor under ``"samples"``.
        """
        # Width/height are read from dims -2 and -3 (assumes a channels-last image
        # batch -- TODO confirm against the IMAGE convention used by callers).
        batch_size, height, width = image.shape[0], image.shape[-3], image.shape[-2]
        # Only the first three channels are encoded; dim 1 of the result is moved last.
        encoded = vae.encode(image[:, :, :, :3])
        controlnet_input = encoded.movedim(1, -1)
        stage_c = {"samples": torch.zeros([batch_size, 16, height // 16, width // 16])}
        stage_b = {"samples": torch.zeros([batch_size, 4, height // 2, width // 2])}
        return (controlnet_input, stage_c, stage_b)
 # Registry of the Stable Cascade node classes, keyed by node identifier string.
 # NOTE(review): presumably consumed by the node loader to expose these nodes -- confirm.
 NODE_CLASS_MAPPINGS = {
    "StableCascade_EmptyLatentImage": StableCascade_EmptyLatentImage,
    "StableCascade_StageB_Conditioning": StableCascade_StageB_Conditioning,
    "StableCascade_StageC_VAEEncode": StableCascade_StageC_VAEEncode,
    "StableCascade_SuperResolutionControlnet": StableCascade_SuperResolutionControlnet,
 }
--- a/comfy_extras/nodes_differential_diffusion.py
+++ b/comfy_extras/nodes_differential_diffusion.py
@ -0,0 +1,42 @@
 # code adapted from https://github.com/exx8/differential-diffusion
 import torch
 class DifferentialDiffusion():
    """Model-patching node that installs a per-step denoise-mask thresholding function."""
    RETURN_TYPES = ("MODEL",)
    FUNCTION = "apply"
    CATEGORY = "_for_testing"
    INIT = False
    @classmethod
    def INPUT_TYPES(s):
        """Declare the single required ``model`` input."""
        required_inputs = {"model": ("MODEL", )}
        return {"required": required_inputs}
    def apply(self, model):
        """Clone ``model``, register :meth:`forward` as its denoise-mask function, return it in a 1-tuple."""
        patched = model.clone()
        patched.set_model_denoise_mask_function(self.forward)
        return (patched,)
    def forward(self, sigma: torch.Tensor, denoise_mask: torch.Tensor, extra_options: dict):
        """Binarize ``denoise_mask`` against the normalized progress of the current step.
        ``extra_options`` must provide ``"model"`` (exposing
        ``inner_model.model_sampling``) and ``"sigmas"`` (the sigma schedule).
        The result has the dtype of ``denoise_mask``: 1 where the mask is at or
        above the threshold, 0 elsewhere.
        """
        model = extra_options["model"]
        schedule = extra_options["sigmas"]
        sampling = model.inner_model.model_sampling
        # The end sigma is the larger of the model's sigma_min and the schedule's last sigma.
        sigma_floor = sampling.sigma_min
        final_sigma = schedule[-1]
        sigma_to = final_sigma if final_sigma > sigma_floor else sigma_floor
        sigma_from = schedule[0]
        ts_from = sampling.timestep(sigma_from)
        ts_to = sampling.timestep(sigma_to)
        current_ts = sampling.timestep(sigma[0])
        # Fraction of the timestep range still ahead of the current step.
        span = ts_from - ts_to
        threshold = (current_ts - ts_to) / span
        keep = denoise_mask >= threshold
        return keep.to(denoise_mask.dtype)
 # Maps the node identifier string to the class implementing it.
 # NOTE(review): presumably read by the node loader -- confirm.
 NODE_CLASS_MAPPINGS = {
    "DifferentialDiffusion": DifferentialDiffusion,
 }
 # Maps the same node identifier to a human-readable display name.
 NODE_DISPLAY_NAME_MAPPINGS = {
    "DifferentialDiffusion": "Differential Diffusion",
 }
--- a/comfy_extras/nodes_morphology.py
+++ b/comfy_extras/nodes_morphology.py
@ -0,0 +1,49 @@
 import torch
 import comfy.model_management
 from kornia.morphology import dilation, erosion, opening, closing, gradient, top_hat, bottom_hat
 class Morphology:
    """Image post-processing node that applies one kornia morphology operator.
    The structuring element is a ``kernel_size`` x ``kernel_size`` block of ones.
    """
    # Accepted operation names, in the order INPUT_TYPES lists them. Single source
    # of truth for the input declaration, validation and the error message.
    _OPERATIONS = ("erode", "dilate", "open", "close", "gradient", "bottom_hat", "top_hat")
    RETURN_TYPES = ("IMAGE",)
    FUNCTION = "process"
    CATEGORY = "image/postprocessing"
    @classmethod
    def INPUT_TYPES(s):
        """Declare the required inputs: ``image``, ``operation`` and ``kernel_size``."""
        return {"required": {"image": ("IMAGE",),
                                "operation": (list(s._OPERATIONS),),
                                "kernel_size": ("INT", {"default": 3, "min": 3, "max": 999, "step": 1}),
                                }}
    def process(self, image, operation, kernel_size):
        """Apply ``operation`` to ``image`` and return the result in a 1-tuple.
        Args:
            image: image tensor; its last dim is moved to position 1 before the
                operator runs and moved back afterwards (assumes a channels-last
                batch -- TODO confirm against callers).
            operation: one of the names in ``_OPERATIONS``.
            kernel_size: side length of the square all-ones kernel.
        Raises:
            ValueError: if ``operation`` is not a supported name.
        """
        # Reject bad names before any device transfer or allocation happens.
        if operation not in self._OPERATIONS:
            valid_names = ", ".join(repr(name) for name in self._OPERATIONS)
            raise ValueError(f"Invalid operation {operation} for morphology. Must be one of {valid_names}")
        # Built here (not at class level) so the kornia names are only resolved on use.
        operators = {
            "erode": erosion,
            "dilate": dilation,
            "open": opening,
            "close": closing,
            "gradient": gradient,
            "top_hat": top_hat,
            "bottom_hat": bottom_hat,
        }
        device = comfy.model_management.get_torch_device()
        kernel = torch.ones(kernel_size, kernel_size, device=device)
        image_k = image.to(device).movedim(-1, 1)
        output = operators[operation](image_k, kernel)
        img_out = output.to(comfy.model_management.intermediate_device()).movedim(1, -1)
        return (img_out,)
 # Maps the node identifier string to the class implementing it.
 # NOTE(review): presumably read by the node loader -- confirm.
 NODE_CLASS_MAPPINGS = {
    "Morphology": Morphology,
 }
 # Maps the same node identifier to its display name; note the displayed name
 # ("ImageMorphology") differs from the identifier ("Morphology").
 NODE_DISPLAY_NAME_MAPPINGS = {
    "Morphology": "ImageMorphology",
 }
--- a/requirements.txt
+++ b/requirements.txt
@ -28,4 +28,6 @@ protobuf
 psutil
 ConfigArgParse
 aio-pika
-pyjwt[crypto]
+pyjwt[crypto]
 kornia>=0.7.1
 mpmath>=1.0,!=1.4.0a0