From 2c4e0b49b728f01e58be52edcdb4dcca83e25b87 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Sun, 2 Jul 2023 09:37:31 -0400
Subject: [PATCH 1/4] Switch to fp16 on some cards when the model is too big.

---
 comfy/model_management.py | 25 ++++++++++++++++++++++---
 comfy/sd.py               |  9 ++++++++-
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 5461d28e4..74f3dadba 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -245,6 +245,8 @@ def unload_model():
             n.cpu()
         current_gpu_controlnets = []

+def minimum_inference_memory():
+    return (768 * 1024 * 1024)

 def load_model_gpu(model):
     global current_loaded_model
@@ -272,7 +274,7 @@ def load_model_gpu(model):
         model_size = model.model_size()
         current_free_mem = get_free_memory(torch_dev)
         lowvram_model_memory = int(max(256 * (1024 * 1024), (current_free_mem - 1024 * (1024 * 1024)) / 1.3 ))
-        if model_size > (current_free_mem - (512 * 1024 * 1024)): #only switch to lowvram if really necessary
+        if model_size > (current_free_mem - minimum_inference_memory()): #only switch to lowvram if really necessary
             vram_set_state = VRAMState.LOW_VRAM

     current_loaded_model = model
@@ -458,7 +460,7 @@ def is_device_cpu(device):
             return True
     return False

-def should_use_fp16(device=None):
+def should_use_fp16(device=None, model_params=0):
     global xpu_available
     global directml_enabled

@@ -482,10 +484,27 @@ def should_use_fp16(device=None):
         return True

     props = torch.cuda.get_device_properties("cuda")
+    if props.major < 6:
+        return False
+
+    fp16_works = False
+    #FP16 is confirmed working on a 1080 (GP104) but it's a bit slower than FP32 so it should only be enabled
+    #when the model doesn't actually fit on the card
+    #TODO: actually test if GP106 and others have the same type of behavior
+    nvidia_10_series = ["1080", "1070", "titan x", "p3000", "p3200", "p4000", "p4200", "p5000", "p5200", "p6000", "1060", "1050"]
+    for x in nvidia_10_series:
+        if x in props.name.lower():
+            fp16_works = True
+
+    if fp16_works:
+        free_model_memory = (get_free_memory() * 0.9 - minimum_inference_memory())
+        if model_params * 4 > free_model_memory:
+            return True
+
     if props.major < 7:
         return False

-    #FP32 is faster on those cards?
+    #FP16 is just broken on these cards
     nvidia_16_series = ["1660", "1650", "1630", "T500", "T550", "T600"]
     for x in nvidia_16_series:
         if x in props.name:
diff --git a/comfy/sd.py b/comfy/sd.py
index 3d79c7c04..fc3551fea 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1122,6 +1122,12 @@ def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_cl

     return (ModelPatcher(model, load_device=model_management.get_torch_device(), offload_device=offload_device), clip, vae)

+def calculate_parameters(sd, prefix):
+    params = 0
+    for k in sd.keys():
+        if k.startswith(prefix):
+            params += sd[k].nelement()
+    return params

 def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None):
     sd = utils.load_torch_file(ckpt_path)
@@ -1132,7 +1138,8 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
     model = None
     clip_target = None

-    fp16 = model_management.should_use_fp16()
+    parameters = calculate_parameters(sd, "model.diffusion_model.")
+    fp16 = model_management.should_use_fp16(model_params=parameters)

     class WeightsLoader(torch.nn.Module):
         pass
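
A rough standalone sketch of the size check PATCH 1/4 adds for Pascal (10-series) cards, where fp16 works but runs slower than fp32, so it should only be chosen when the weights cannot fit in free VRAM at 4 bytes per parameter. The helper name, the explicit free-VRAM argument, and the example figures below are illustrative assumptions; the actual code reads free memory via get_free_memory() inside should_use_fp16().

def model_fits_in_fp32(model_params, free_vram_bytes, inference_reserve=768 * 1024 * 1024):
    # Mirror the patch's headroom: keep 10% of free VRAM plus a fixed inference reserve.
    usable = free_vram_bytes * 0.9 - inference_reserve
    # fp32 weights take roughly 4 bytes per parameter.
    return model_params * 4 <= usable

# Hypothetical example: a ~2.6B-parameter diffusion model with 8 GiB of VRAM free.
params = 2_600_000_000
free = 8 * 1024 ** 3
print("use fp16:", not model_fits_in_fp32(params, free))  # True: ~10.4 GB of fp32 weights will not fit

calculate_parameters() in comfy/sd.py supplies model_params by summing nelement() over the "model.diffusion_model." keys of the checkpoint, so the check can be made straight from the loaded state dict.
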
From ae948b42fa494256ae4da46a06fee200e173d4ec Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Sun, 2 Jul 2023 11:47:30 -0400
Subject: [PATCH 2/4] Add taesd weights to standalones.

---
 .github/workflows/windows_release_cu118_package.yml   | 2 ++
 .github/workflows/windows_release_nightly_pytorch.yml | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/.github/workflows/windows_release_cu118_package.yml b/.github/workflows/windows_release_cu118_package.yml
index 2d6048a23..022fdc2f9 100644
--- a/.github/workflows/windows_release_cu118_package.yml
+++ b/.github/workflows/windows_release_cu118_package.yml
@@ -45,6 +45,8 @@ jobs:
         sed -i '1i../ComfyUI' ./python310._pth
         cd ..

+        git clone https://github.com/comfyanonymous/taesd
+        cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/

         mkdir ComfyUI_windows_portable
         mv python_embeded ComfyUI_windows_portable
diff --git a/.github/workflows/windows_release_nightly_pytorch.yml b/.github/workflows/windows_release_nightly_pytorch.yml
index dde50a73f..c7843d402 100644
--- a/.github/workflows/windows_release_nightly_pytorch.yml
+++ b/.github/workflows/windows_release_nightly_pytorch.yml
@@ -37,6 +37,8 @@ jobs:
         sed -i '1i../ComfyUI' ./python311._pth
         cd ..

+        git clone https://github.com/comfyanonymous/taesd
+        cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/

         mkdir ComfyUI_windows_portable_nightly_pytorch
         mv python_embeded ComfyUI_windows_portable_nightly_pytorch

From 103c487a897bcf82cc8add4035f38ac920a7f150 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Sun, 2 Jul 2023 11:57:36 -0400
Subject: [PATCH 3/4] Cleanup.

---
 comfy/ldm/modules/attention.py | 11 +++++++----
 main.py                        |  4 ----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py
index 0c54f7f47..5f9eaa6eb 100644
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -16,11 +16,14 @@ if model_management.xformers_enabled():
     import xformers
     import xformers.ops

-# CrossAttn precision handling
-import os
-_ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32")
-
 from comfy.cli_args import args
+# CrossAttn precision handling
+if args.dont_upcast_attention:
+    print("disabling upcasting of attention")
+    _ATTN_PRECISION = "fp16"
+else:
+    _ATTN_PRECISION = "fp32"
+

 def exists(val):
     return val is not None
diff --git a/main.py b/main.py
index 22425d2aa..715649975 100644
--- a/main.py
+++ b/main.py
@@ -14,10 +14,6 @@ if os.name == "nt":
     logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage())

 if __name__ == "__main__":
-    if args.dont_upcast_attention:
-        print("disabling upcasting of attention")
-        os.environ['ATTN_PRECISION'] = "fp16"
-
     if args.cuda_device is not None:
         os.environ['CUDA_VISIBLE_DEVICES'] = str(args.cuda_device)
         print("Set cuda device to:", args.cuda_device)
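
PATCH 3/4 makes the attention-upcast decision at import time from the --dont-upcast-attention flag instead of the ATTN_PRECISION environment variable previously exported by main.py, so the choice is read directly where _ATTN_PRECISION is defined. A simplified sketch of what the module-level precision switch controls, not the project's CrossAttention code; the function and tensors below are made up for illustration:

import torch

_ATTN_PRECISION = "fp32"  # in the patch this is chosen once from args.dont_upcast_attention

def attention_probs(q, k, upcast=(_ATTN_PRECISION == "fp32")):
    # Upcasting q/k to fp32 before the matmul keeps the softmax logits from
    # overflowing when the model itself runs in fp16; skipping the upcast
    # (fp16 attention) trades that safety for speed and memory.
    if upcast:
        q, k = q.float(), k.float()
    scale = q.shape[-1] ** -0.5
    return torch.softmax(q @ k.transpose(-2, -1) * scale, dim=-1)

q = torch.randn(1, 8, 64, dtype=torch.float16)
k = torch.randn(1, 8, 64, dtype=torch.float16)
print(attention_probs(q, k).dtype)  # torch.float32 while upcasting is left on
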
From dc9d1f31c853503f238a2d6eacd0e4c68d9e9e42 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Mon, 3 Jul 2023 00:08:30 -0400
Subject: [PATCH 4/4] Improvements for OSX.

---
 comfy/model_management.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 74f3dadba..dcfd57b57 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -334,19 +334,19 @@ def unload_if_low_vram(model):
     return model

 def unet_offload_device():
-    if vram_state == VRAMState.HIGH_VRAM or vram_state == VRAMState.SHARED:
+    if vram_state == VRAMState.HIGH_VRAM:
         return get_torch_device()
     else:
         return torch.device("cpu")

 def text_encoder_offload_device():
-    if args.gpu_only or vram_state == VRAMState.SHARED:
+    if args.gpu_only:
         return get_torch_device()
     else:
         return torch.device("cpu")

 def text_encoder_device():
-    if args.gpu_only or vram_state == VRAMState.SHARED:
+    if args.gpu_only:
         return get_torch_device()
     elif vram_state == VRAMState.HIGH_VRAM or vram_state == VRAMState.NORMAL_VRAM:
         if torch.get_num_threads() < 8: #leaving the text encoder on the CPU is faster than shifting it if the CPU is fast enough.
@@ -360,7 +360,7 @@ def vae_device():
     return get_torch_device()

 def vae_offload_device():
-    if args.gpu_only or vram_state == VRAMState.SHARED:
+    if args.gpu_only:
         return get_torch_device()
     else:
         return torch.device("cpu")
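
PATCH 4/4 drops the VRAMState.SHARED special cases, so on macOS the UNet, text encoder and VAE now offload to the CPU like on any other backend, presumably because Apple Silicon's unified memory leaves little to gain from pinning idle weights to the MPS device. A rough sketch of the resulting offload choice, collapsing ComfyUI's VRAMState enum and args into a single boolean; the function name and parameter are illustrative, not the project's API:

import torch

def offload_device(keep_on_gpu: bool) -> torch.device:
    # After the patch only HIGH_VRAM (for the UNet) or --gpu-only (for the
    # text encoder and VAE) keep weights on the accelerator; everything else,
    # including MPS/SHARED setups, offloads to the CPU.
    if keep_on_gpu:
        if torch.cuda.is_available():
            return torch.device("cuda")
        if torch.backends.mps.is_available():
            return torch.device("mps")
    return torch.device("cpu")

print(offload_device(keep_on_gpu=False))  # cpu -- the common case after this patch
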