From 2c4e0b49b728f01e58be52edcdb4dcca83e25b87 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Sun, 2 Jul 2023 09:37:31 -0400
Subject: [PATCH 1/4] Switch to fp16 on some cards when the model is too big.

---
 comfy/model_management.py | 25 ++++++++++++++++++++++---
 comfy/sd.py               |  9 ++++++++-
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 5461d28e4..74f3dadba 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -245,6 +245,8 @@ def unload_model():
             n.cpu()
         current_gpu_controlnets = []

+def minimum_inference_memory():
+    return (768 * 1024 * 1024)

 def load_model_gpu(model):
     global current_loaded_model
@@ -272,7 +274,7 @@ def load_model_gpu(model):
         model_size = model.model_size()
         current_free_mem = get_free_memory(torch_dev)
         lowvram_model_memory = int(max(256 * (1024 * 1024), (current_free_mem - 1024 * (1024 * 1024)) / 1.3 ))
-        if model_size > (current_free_mem - (512 * 1024 * 1024)): #only switch to lowvram if really necessary
+        if model_size > (current_free_mem - minimum_inference_memory()): #only switch to lowvram if really necessary
             vram_set_state = VRAMState.LOW_VRAM

     current_loaded_model = model
@@ -458,7 +460,7 @@ def is_device_cpu(device):
             return True
     return False

-def should_use_fp16(device=None):
+def should_use_fp16(device=None, model_params=0):
     global xpu_available
     global directml_enabled

@@ -482,10 +484,27 @@ def should_use_fp16(device=None):
         return True

     props = torch.cuda.get_device_properties("cuda")
+    if props.major < 6:
+        return False
+
+    fp16_works = False
+    #FP16 is confirmed working on a 1080 (GP104) but it's a bit slower than FP32 so it should only be enabled
+    #when the model doesn't actually fit on the card
+    #TODO: actually test if GP106 and others have the same type of behavior
+    nvidia_10_series = ["1080", "1070", "titan x", "p3000", "p3200", "p4000", "p4200", "p5000", "p5200", "p6000", "1060", "1050"]
+    for x in nvidia_10_series:
+        if x in props.name.lower():
+            fp16_works = True
+
+    if fp16_works:
+        free_model_memory = (get_free_memory() * 0.9 - minimum_inference_memory())
+        if model_params * 4 > free_model_memory:
+            return True
+
     if props.major < 7:
         return False

-    #FP32 is faster on those cards?
+    #FP16 is just broken on these cards
     nvidia_16_series = ["1660", "1650", "1630", "T500", "T550", "T600"]
     for x in nvidia_16_series:
         if x in props.name:
diff --git a/comfy/sd.py b/comfy/sd.py
index 3d79c7c04..fc3551fea 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1122,6 +1122,12 @@ def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_cl

     return (ModelPatcher(model, load_device=model_management.get_torch_device(), offload_device=offload_device), clip, vae)

+def calculate_parameters(sd, prefix):
+    params = 0
+    for k in sd.keys():
+        if k.startswith(prefix):
+            params += sd[k].nelement()
+    return params

 def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None):
     sd = utils.load_torch_file(ckpt_path)
@@ -1132,7 +1138,8 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
     model = None
     clip_target = None

-    fp16 = model_management.should_use_fp16()
+    parameters = calculate_parameters(sd, "model.diffusion_model.")
+    fp16 = model_management.should_use_fp16(model_params=parameters)

     class WeightsLoader(torch.nn.Module):
         pass
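
A rough standalone sketch of the size check PATCH 1/4 adds for Pascal (10-series) cards, where fp16 works but runs slower than fp32, so it should only be chosen when the weights cannot fit in free VRAM at 4 bytes per parameter. The helper name, the explicit free-VRAM argument, and the example figures below are illustrative assumptions; the actual code reads free memory via get_free_memory() inside should_use_fp16().

def model_fits_in_fp32(model_params, free_vram_bytes, inference_reserve=768 * 1024 * 1024):
    # Mirror the patch's headroom: keep 10% of free VRAM plus a fixed inference reserve.
    usable = free_vram_bytes * 0.9 - inference_reserve
    # fp32 weights take roughly 4 bytes per parameter.
    return model_params * 4 <= usable

# Hypothetical example: a ~2.6B-parameter diffusion model with 8 GiB of VRAM free.
params = 2_600_000_000
free = 8 * 1024 ** 3
print("use fp16:", not model_fits_in_fp32(params, free))  # True: ~10.4 GB of fp32 weights will not fit

calculate_parameters() in comfy/sd.py supplies model_params by summing nelement() over the "model.diffusion_model." keys of the checkpoint, so the check can be made straight from the loaded state dict.
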
From ae948b42fa494256ae4da46a06fee200e173d4ec Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Sun, 2 Jul 2023 11:47:30 -0400
Subject: [PATCH 2/4] Add taesd weights to standalones.

---
 .github/workflows/windows_release_cu118_package.yml   | 2 ++
 .github/workflows/windows_release_nightly_pytorch.yml | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/.github/workflows/windows_release_cu118_package.yml b/.github/workflows/windows_release_cu118_package.yml
index 2d6048a23..022fdc2f9 100644
--- a/.github/workflows/windows_release_cu118_package.yml
+++ b/.github/workflows/windows_release_cu118_package.yml
@@ -45,6 +45,8 @@ jobs:
         sed -i '1i../ComfyUI' ./python310._pth
         cd ..

+        git clone https://github.com/comfyanonymous/taesd
+        cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/

         mkdir ComfyUI_windows_portable
         mv python_embeded ComfyUI_windows_portable
diff --git a/.github/workflows/windows_release_nightly_pytorch.yml b/.github/workflows/windows_release_nightly_pytorch.yml
index dde50a73f..c7843d402 100644
--- a/.github/workflows/windows_release_nightly_pytorch.yml
+++ b/.github/workflows/windows_release_nightly_pytorch.yml
@@ -37,6 +37,8 @@ jobs:
         sed -i '1i../ComfyUI' ./python311._pth
         cd ..

+        git clone https://github.com/comfyanonymous/taesd
+        cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/

         mkdir ComfyUI_windows_portable_nightly_pytorch
         mv python_embeded ComfyUI_windows_portable_nightly_pytorch

From 103c487a897bcf82cc8add4035f38ac920a7f150 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Sun, 2 Jul 2023 11:57:36 -0400
Subject: [PATCH 3/4] Cleanup.

---
 comfy/ldm/modules/attention.py | 11 +++++++----
 main.py                        |  4 ----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py
index 0c54f7f47..5f9eaa6eb 100644
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -16,11 +16,14 @@ if model_management.xformers_enabled():
     import xformers
     import xformers.ops

-# CrossAttn precision handling
-import os
-_ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32")
-
 from comfy.cli_args import args
+# CrossAttn precision handling
+if args.dont_upcast_attention:
+    print("disabling upcasting of attention")
+    _ATTN_PRECISION = "fp16"
+else:
+    _ATTN_PRECISION = "fp32"
+

 def exists(val):
     return val is not None
diff --git a/main.py b/main.py
index 22425d2aa..715649975 100644
--- a/main.py
+++ b/main.py
@@ -14,10 +14,6 @@ if os.name == "nt":
     logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage())

 if __name__ == "__main__":
-    if args.dont_upcast_attention:
-        print("disabling upcasting of attention")
-        os.environ['ATTN_PRECISION'] = "fp16"
-
     if args.cuda_device is not None:
         os.environ['CUDA_VISIBLE_DEVICES'] = str(args.cuda_device)
         print("Set cuda device to:", args.cuda_device)
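
PATCH 3/4 makes the attention-upcast decision at import time from the --dont-upcast-attention flag instead of the ATTN_PRECISION environment variable previously exported by main.py, so the choice is read directly where _ATTN_PRECISION is defined. A simplified sketch of what the module-level precision switch controls, not the project's CrossAttention code; the function and tensors below are made up for illustration:

import torch

_ATTN_PRECISION = "fp32"  # in the patch this is chosen once from args.dont_upcast_attention

def attention_probs(q, k, upcast=(_ATTN_PRECISION == "fp32")):
    # Upcasting q/k to fp32 before the matmul keeps the softmax logits from
    # overflowing when the model itself runs in fp16; skipping the upcast
    # (fp16 attention) trades that safety for speed and memory.
    if upcast:
        q, k = q.float(), k.float()
    scale = q.shape[-1] ** -0.5
    return torch.softmax(q @ k.transpose(-2, -1) * scale, dim=-1)

q = torch.randn(1, 8, 64, dtype=torch.float16)
k = torch.randn(1, 8, 64, dtype=torch.float16)
print(attention_probs(q, k).dtype)  # torch.float32 while upcasting is left on
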
From dc9d1f31c853503f238a2d6eacd0e4c68d9e9e42 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Mon, 3 Jul 2023 00:08:30 -0400
Subject: [PATCH 4/4] Improvements for OSX.

---
 comfy/model_management.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 74f3dadba..dcfd57b57 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -334,19 +334,19 @@ def unload_if_low_vram(model):
     return model

 def unet_offload_device():
-    if vram_state == VRAMState.HIGH_VRAM or vram_state == VRAMState.SHARED:
+    if vram_state == VRAMState.HIGH_VRAM:
         return get_torch_device()
     else:
         return torch.device("cpu")

 def text_encoder_offload_device():
-    if args.gpu_only or vram_state == VRAMState.SHARED:
+    if args.gpu_only:
         return get_torch_device()
     else:
         return torch.device("cpu")

 def text_encoder_device():
-    if args.gpu_only or vram_state == VRAMState.SHARED:
+    if args.gpu_only:
         return get_torch_device()
     elif vram_state == VRAMState.HIGH_VRAM or vram_state == VRAMState.NORMAL_VRAM:
         if torch.get_num_threads() < 8: #leaving the text encoder on the CPU is faster than shifting it if the CPU is fast enough.
@@ -360,7 +360,7 @@ def vae_device():
     return get_torch_device()

 def vae_offload_device():
-    if args.gpu_only or vram_state == VRAMState.SHARED:
+    if args.gpu_only:
         return get_torch_device()
     else:
         return torch.device("cpu")
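
PATCH 4/4 drops the VRAMState.SHARED special cases, so on macOS the UNet, text encoder and VAE now offload to the CPU like on any other backend, presumably because Apple Silicon's unified memory leaves little to gain from pinning idle weights to the MPS device. A rough sketch of the resulting offload choice, collapsing ComfyUI's VRAMState enum and args into a single boolean; the function name and parameter are illustrative, not the project's API:

import torch

def offload_device(keep_on_gpu: bool) -> torch.device:
    # After the patch only HIGH_VRAM (for the UNet) or --gpu-only (for the
    # text encoder and VAE) keep weights on the accelerator; everything else,
    # including MPS/SHARED setups, offloads to the CPU.
    if keep_on_gpu:
        if torch.cuda.is_available():
            return torch.device("cuda")
        if torch.backends.mps.is_available():
            return torch.device("mps")
    return torch.device("cpu")

print(offload_device(keep_on_gpu=False))  # cpu -- the common case after this patch
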