From ee8f8ee07fb141e5a5ce3abf602ed0fa2e50cf7b Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Mon, 17 Jul 2023 09:35:19 -0400
Subject: [PATCH 1/6] Fix regression with ddim and uni_pc when batch size > 1.

---
 comfy/k_diffusion/external.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/comfy/k_diffusion/external.py b/comfy/k_diffusion/external.py
index 680a3568c..7335d56c4 100644
--- a/comfy/k_diffusion/external.py
+++ b/comfy/k_diffusion/external.py
@@ -92,8 +92,8 @@ class DiscreteSchedule(nn.Module):
 
     def predict_eps_discrete_timestep(self, input, t, **kwargs):
         sigma = self.t_to_sigma(t.round())
-        input = input * ((sigma ** 2 + 1.0) ** 0.5)
-        return (input - self(input, sigma, **kwargs)) / sigma
+        input = input * ((utils.append_dims(sigma, input.ndim) ** 2 + 1.0) ** 0.5)
+        return (input - self(input, sigma, **kwargs)) / utils.append_dims(sigma, input.ndim)
 
 class DiscreteEpsDDPMDenoiser(DiscreteSchedule):
     """A wrapper for discrete schedule DDPM models that output eps (the predicted

From 3a150bad1590d6e23cfcf3e621d575dba1ca8c2a Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Mon, 17 Jul 2023 10:11:08 -0400
Subject: [PATCH 2/6] Only calculate randn in some samplers when it's actually
 being used.

---
 comfy/k_diffusion/sampling.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py
index 4cc2534f3..3b4e99315 100644
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@@ -131,9 +131,9 @@ def sample_euler(model, x, sigmas, extra_args=None, callback=None, disable=None,
     s_in = x.new_ones([x.shape[0]])
     for i in trange(len(sigmas) - 1, disable=disable):
         gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
-        eps = torch.randn_like(x) * s_noise
         sigma_hat = sigmas[i] * (gamma + 1)
         if gamma > 0:
+            eps = torch.randn_like(x) * s_noise
             x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
         denoised = model(x, sigma_hat * s_in, **extra_args)
         d = to_d(x, sigma_hat, denoised)
@@ -172,9 +172,9 @@ def sample_heun(model, x, sigmas, extra_args=None, callback=None, disable=None,
     s_in = x.new_ones([x.shape[0]])
     for i in trange(len(sigmas) - 1, disable=disable):
         gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
-        eps = torch.randn_like(x) * s_noise
         sigma_hat = sigmas[i] * (gamma + 1)
         if gamma > 0:
+            eps = torch.randn_like(x) * s_noise
             x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
         denoised = model(x, sigma_hat * s_in, **extra_args)
         d = to_d(x, sigma_hat, denoised)
@@ -201,9 +201,9 @@ def sample_dpm_2(model, x, sigmas, extra_args=None, callback=None, disable=None,
     s_in = x.new_ones([x.shape[0]])
     for i in trange(len(sigmas) - 1, disable=disable):
         gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
-        eps = torch.randn_like(x) * s_noise
         sigma_hat = sigmas[i] * (gamma + 1)
         if gamma > 0:
+            eps = torch.randn_like(x) * s_noise
             x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
         denoised = model(x, sigma_hat * s_in, **extra_args)
         d = to_d(x, sigma_hat, denoised)
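Note on PATCH 1/6: with batch size > 1, sigma has shape [batch] while input has shape [batch, C, H, W], so scaling or dividing by sigma directly either fails or broadcasts over the wrong axis. utils.append_dims pads sigma with trailing singleton dimensions so the operation is applied per sample. A minimal standalone sketch of that behaviour (the append_dims below is a reimplementation for illustration, not the patched file itself):

    import torch

    def append_dims(x, target_dims):
        # Pad x with trailing singleton dims until it has target_dims dimensions,
        # mirroring what k-diffusion's utils.append_dims does.
        return x[(...,) + (None,) * (target_dims - x.ndim)]

    latents = torch.randn(2, 4, 8, 8)            # batch of 2 latent images
    sigma = torch.tensor([0.5, 1.0])             # one sigma per batch element
    sigma_b = append_dims(sigma, latents.ndim)   # shape [2, 1, 1, 1]
    scaled = latents * ((sigma_b ** 2 + 1.0) ** 0.5)
    print(scaled.shape)                          # torch.Size([2, 4, 8, 8])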
From 1679abd86d944521cad8a94a09d30fd5e238ae22 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Mon, 17 Jul 2023 11:00:14 -0400
Subject: [PATCH 3/6] Add a command line argument to enable
 backend:cudaMallocAsync

---
 comfy/cli_args.py         |  1 +
 comfy/model_management.py |  2 +-
 main.py                   | 10 +++++++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index bef1868b9..9a388b03b 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -40,6 +40,7 @@ parser.add_argument("--extra-model-paths-config", type=str, default=None, metava
 parser.add_argument("--output-directory", type=str, default=None, help="Set the ComfyUI output directory.")
 parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
 parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.")
+parser.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync.")
 parser.add_argument("--dont-upcast-attention", action="store_true", help="Disable upcasting of attention. Can boost speed but increase the chances of black images.")
 
 fp_group = parser.add_mutually_exclusive_group()
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 92c8ac842..69542cc37 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -204,7 +204,7 @@ print(f"Set vram state to: {vram_state.name}")
 def get_torch_device_name(device):
     if hasattr(device, 'type'):
         if device.type == "cuda":
-            return "{} {}".format(device, torch.cuda.get_device_name(device))
+            return "{} {} : {}".format(device, torch.cuda.get_device_name(device), torch.cuda.get_allocator_backend())
         else:
             return "{}".format(device.type)
     else:
diff --git a/main.py b/main.py
index 802e4bfe4..a22545573 100644
--- a/main.py
+++ b/main.py
@@ -51,7 +51,6 @@ import threading
 import gc
 
 from comfy.cli_args import args
-import comfy.utils
 
 if os.name == "nt":
     import logging
@@ -62,7 +61,16 @@ if __name__ == "__main__":
         os.environ['CUDA_VISIBLE_DEVICES'] = str(args.cuda_device)
         print("Set cuda device to:", args.cuda_device)
 
+    if args.cuda_malloc:
+        env_var = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', None)
+        if env_var is None:
+            env_var = "backend:cudaMallocAsync"
+        else:
+            env_var += ",backend:cudaMallocAsync"
+        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = env_var
+
 
+import comfy.utils
 import yaml
 
 import execution

From 55d0fca9fa7f7c5a82a3c5fa8993d52f3d1902e5 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Mon, 17 Jul 2023 14:10:36 -0400
Subject: [PATCH 4/6] --windows-standalone-build now enables --cuda-malloc

---
 comfy/cli_args.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index 9a388b03b..e7ce256b7 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -85,3 +85,4 @@ args = parser.parse_args()
 
 if args.windows_standalone_build:
     args.auto_launch = True
+    args.cuda_malloc = True #work around memory issue in nvidia drivers > 531
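Note on PATCH 3/6: PYTORCH_CUDA_ALLOC_CONF has to be in the environment before torch sets up its CUDA allocator, which is why main.py writes it before anything imports torch (and why the import of comfy.utils, which pulls in torch, is moved below the block). A minimal standalone sketch of the same pattern; this is an illustrative script, not ComfyUI code itself:

    import os

    # Append backend:cudaMallocAsync to whatever allocator config is already set.
    env_var = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', None)
    if env_var is None:
        env_var = "backend:cudaMallocAsync"
    else:
        env_var += ",backend:cudaMallocAsync"
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = env_var

    # Import torch only after the variable is set so the setting takes effect.
    import torch

    if torch.cuda.is_available():
        # get_allocator_backend() is missing on older torch builds, which is
        # what PATCH 6/6 guards against with try/except.
        print(torch.cuda.get_allocator_backend())  # expected: cudaMallocAsync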
From 9871a15cf9cccb6b477ff86964209de1fc174007 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Mon, 17 Jul 2023 14:40:29 -0400
Subject: [PATCH 5/6] Enable --cuda-malloc by default on torch 2.0 and up.

Add --disable-cuda-malloc to disable it.
---
 comfy/cli_args.py |  6 ++++--
 main.py           | 18 +++++++++++++++++-
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index e7ce256b7..29e5fb159 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -40,7 +40,10 @@ parser.add_argument("--extra-model-paths-config", type=str, default=None, metava
 parser.add_argument("--output-directory", type=str, default=None, help="Set the ComfyUI output directory.")
 parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
 parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.")
-parser.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync.")
+cm_group = parser.add_mutually_exclusive_group()
+cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
+cm_group.add_argument("--disable-cuda-malloc", action="store_true", help="Enable cudaMallocAsync.")
+
 parser.add_argument("--dont-upcast-attention", action="store_true", help="Disable upcasting of attention. Can boost speed but increase the chances of black images.")
 
 fp_group = parser.add_mutually_exclusive_group()
@@ -85,4 +88,3 @@ args = parser.parse_args()
 
 if args.windows_standalone_build:
     args.auto_launch = True
-    args.cuda_malloc = True #work around memory issue in nvidia drivers > 531
diff --git a/main.py b/main.py
index a22545573..61ba9e8e6 100644
--- a/main.py
+++ b/main.py
@@ -61,7 +61,23 @@ if __name__ == "__main__":
         os.environ['CUDA_VISIBLE_DEVICES'] = str(args.cuda_device)
         print("Set cuda device to:", args.cuda_device)
 
-    if args.cuda_malloc:
+    if not args.cuda_malloc:
+        try: #if there's a better way to check the torch version without importing it let me know
+            version = ""
+            torch_spec = importlib.util.find_spec("torch")
+            for folder in torch_spec.submodule_search_locations:
+                ver_file = os.path.join(folder, "version.py")
+                if os.path.isfile(ver_file):
+                    spec = importlib.util.spec_from_file_location("torch_version_import", ver_file)
+                    module = importlib.util.module_from_spec(spec)
+                    spec.loader.exec_module(module)
+                    version = module.__version__
+            if int(version[0]) >= 2: #enable by default for torch version 2.0 and up
+                args.cuda_malloc = True
+        except:
+            pass
+
+    if args.cuda_malloc and not args.disable_cuda_malloc:
         env_var = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', None)
         if env_var is None:
             env_var = "backend:cudaMallocAsync"

From ff6b047a74745f93d528df19cc3b40c6f185808c Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Mon, 17 Jul 2023 15:18:58 -0400
Subject: [PATCH 6/6] Fix device print on old torch version.

---
 comfy/model_management.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 69542cc37..34d22429a 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -204,7 +204,11 @@ print(f"Set vram state to: {vram_state.name}")
 def get_torch_device_name(device):
     if hasattr(device, 'type'):
         if device.type == "cuda":
-            return "{} {} : {}".format(device, torch.cuda.get_device_name(device), torch.cuda.get_allocator_backend())
+            try:
+                allocator_backend = torch.cuda.get_allocator_backend()
+            except:
+                allocator_backend = ""
+            return "{} {} : {}".format(device, torch.cuda.get_device_name(device), allocator_backend)
         else:
             return "{}".format(device.type)
     else:
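Note on the version check in PATCH 5/6: importing torch just to read its version would initialize the allocator before PYTORCH_CUDA_ALLOC_CONF is set, so the patch executes only torch/version.py off disk via importlib. A standalone sketch of that technique, reorganized into a helper for illustration (error handling trimmed; not the exact main.py code):

    import importlib.util
    import os

    def torch_version_without_import():
        # Locate the installed torch package and execute only its version.py,
        # so torch/__init__.py (and CUDA initialization) never runs.
        torch_spec = importlib.util.find_spec("torch")
        for folder in torch_spec.submodule_search_locations:
            ver_file = os.path.join(folder, "version.py")
            if os.path.isfile(ver_file):
                spec = importlib.util.spec_from_file_location("torch_version_import", ver_file)
                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)
                return module.__version__
        return ""

    version = torch_version_without_import()
    enable_cuda_malloc = bool(version) and int(version[0]) >= 2  # default on for torch 2.x
    print(version, enable_cuda_malloc)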