Memory management and compilation improvements

- Experimental support for sage attention on Linux
- Diffusers loader now supports model indices
- Transformers model management now aligns with updates to ComfyUI
- Flux layers correctly use unbind
- Float8 support for model loading in more places
- Experimental quantization approaches from Quanto and torchao
- Model upscaling interacts with memory management better

This update also disables ROCm testing in CI because it isn't reliable enough
on consumer hardware; the RX 7600 in particular is not well supported by ROCm.
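
As a rough illustration of how the optional attention backends are gated (a minimal sketch, not this repository's actual module; the flag names simply mirror the new --disable-flash-attn and --disable-sage-attention options, and the precedence shown is only a plausible reading of the changes below):

import importlib.util

# Hypothetical stand-ins for the new --disable-flash-attn / --disable-sage-attention options.
disable_flash_attn = False
disable_sage_attention = False

# Both packages are optional: enable a backend only when it is installed and not disabled.
FLASH_ATTENTION_ENABLED = not disable_flash_attn and importlib.util.find_spec("flash_attn") is not None
SAGE_ATTENTION_ENABLED = not disable_sage_attention and importlib.util.find_spec("sageattention") is not None


def choose_attention_backend(xformers_available: bool = False) -> str:
    # The diff only shows that sage attention defers to xformers; the rest of this
    # ordering is illustrative, with PyTorch SDPA as the universal fallback.
    if SAGE_ATTENTION_ENABLED and not xformers_available:
        return "sageattention"
    if FLASH_ATTENTION_ENABLED:
        return "flash_attn"
    return "pytorch_sdpa"


print(choose_attention_backend())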
This commit is contained in:
doctorpangloss 2024-10-09 09:11:22 -07:00
parent 0a25b67ff8
commit bbe2ed330c
16 changed files with 319 additions and 142 deletions

View File

@@ -52,8 +52,6 @@ jobs:
       runner:
         - labels: [self-hosted, Linux, X64, cpu]
           container: "ubuntu:22.04"
-        - labels: [self-hosted, Linux, X64, rocm-7600-8gb]
-          container: "rocm/pytorch:rocm6.2_ubuntu22.04_py3.10_pytorch_release_2.3.0"
         - labels: [self-hosted, Linux, X64, cuda-3060-12gb]
           container: "nvcr.io/nvidia/pytorch:24.03-py3"
     steps:

View File

@@ -99,6 +99,8 @@ def _create_parser() -> EnhancedConfigArgParser:
                         help="Use the new pytorch 2.0 cross attention function.")
     parser.add_argument("--disable-xformers", action="store_true", help="Disable xformers.")
+    parser.add_argument("--disable-flash-attn", action="store_true", help="Disable Flash Attention")
+    parser.add_argument("--disable-sage-attention", action="store_true", help="Disable Sage Attention")
     upcast = parser.add_mutually_exclusive_group()
     upcast.add_argument("--force-upcast-attention", action="store_true", help="Force enable attention upcasting, please report if it fixes black images.")

View File

@@ -75,6 +75,8 @@ class Configuration(dict):
         use_quad_cross_attention (bool): Use sub-quadratic cross-attention optimization.
         use_pytorch_cross_attention (bool): Use PyTorch's cross-attention function.
         disable_xformers (bool): Disable xformers.
+        disable_flash_attn (bool): Disable flash_attn package attention.
+        disable_sage_attention (bool): Disable sage attention package attention.
         gpu_only (bool): Run everything on the GPU.
         highvram (bool): Keep models in GPU memory.
         normalvram (bool): Default VRAM usage setting.
@@ -157,6 +159,8 @@ class Configuration(dict):
         self.use_quad_cross_attention: bool = False
         self.use_pytorch_cross_attention: bool = False
         self.disable_xformers: bool = False
+        self.disable_flash_attn: bool = False
+        self.disable_sage_attention: bool = False
         self.gpu_only: bool = False
         self.highvram: bool = False
         self.normalvram: bool = False

View File

@@ -13,9 +13,17 @@ def first_file(path, filenames) -> str | None:
     return None


-def load_diffusers(model_path, output_vae=True, output_clip=True, embedding_directory=None):
-    diffusion_model_names = ["diffusion_pytorch_model.fp16.safetensors", "diffusion_pytorch_model.safetensors", "diffusion_pytorch_model.fp16.bin", "diffusion_pytorch_model.bin"]
-    unet_path = first_file(os.path.join(model_path, "unet"), diffusion_model_names)
+def load_diffusers(model_path, output_vae=True, output_clip=True, embedding_directory=None, model_options=None):
+    if model_options is None:
+        model_options = {}
+    diffusion_model_names = [
+        "diffusion_pytorch_model.fp16.safetensors",
+        "diffusion_pytorch_model.safetensors",
+        "diffusion_pytorch_model.fp16.bin",
+        "diffusion_pytorch_model.bin",
+        "diffusion_pytorch_model.safetensors.index.json"
+    ]
+    unet_path = first_file(os.path.join(model_path, "unet"), diffusion_model_names) or first_file(os.path.join(model_path, "transformer"), diffusion_model_names)
     vae_path = first_file(os.path.join(model_path, "vae"), diffusion_model_names)

     text_encoder_model_names = ["model.fp16.safetensors", "model.safetensors", "pytorch_model.fp16.bin", "pytorch_model.bin"]
@@ -28,7 +36,7 @@ def load_diffusers(model_path, output_vae=True, output_clip=True, embedding_dire
     unet = None
     if unet_path is not None:
-        unet = sd.load_diffusion_model(unet_path)
+        unet = sd.load_diffusion_model(unet_path, model_options=model_options)

     clip = None
     textmodel_json_config1 = first_file(os.path.join(model_path, "text_encoder"), ["config.json"])

View File

@@ -79,13 +79,13 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
         # if we have flash-attn installed, try to use it
         try:
-            import flash_attn
-            attn_override_kwargs = {
-                "attn_implementation": "flash_attention_2",
-                **kwargs_to_try[0]
-            }
-            kwargs_to_try = (attn_override_kwargs, *kwargs_to_try)
-            logging.debug(f"while loading model {ckpt_name}, flash_attn was installed, so the flash_attention_2 implementation will be tried")
+            if model_management.flash_attn_enabled():
+                attn_override_kwargs = {
+                    "attn_implementation": "flash_attention_2",
+                    **kwargs_to_try[0]
+                }
+                kwargs_to_try = (attn_override_kwargs, *kwargs_to_try)
+                logging.debug(f"while loading model {ckpt_name}, flash_attn was installed, so the flash_attention_2 implementation will be tried")
         except ImportError:
             pass
         for i, props in enumerate(kwargs_to_try):
@@ -303,16 +303,13 @@ class TransformersManagedModel(ModelManageable, LanguageModel):
     def model_dtype(self) -> torch.dtype:
         return self.model.dtype

-    def patch_model_lowvram(self, device_to: torch.device, lowvram_model_memory: int, force_patch_weights=False) -> torch.nn.Module:
-        warnings.warn("Transformers models do not currently support adapters like LoRAs")
-        return self.model.to(device=device_to)
-
-    def patch_model(self, device_to: torch.device | None = None, patch_weights: bool = True) -> torch.nn.Module:
-        return self.model.to(device=device_to)
-
-    def unpatch_model(self, offload_device: torch.device | None = None, unpatch_weights: Optional[bool] = False) -> torch.nn.Module:
-        return self.model.to(device=offload_device)
+    def patch_model(self, device_to: torch.device | None = None, lowvram_model_memory: int = 0, load_weights: bool = True, force_patch_weights: bool = False) -> torch.nn.Module:
+        return self.model.to(device=device_to)
+
+    def unpatch_model(self, device_to: torch.device | None = None, unpatch_weights: Optional[bool] = False) -> torch.nn.Module:
+        return self.model.to(device=device_to)

     def patch_processor(self, processor: Any, overwrite_tokenizer: bool = False) -> TransformersManagedModel:
         model = copy.copy(self)
         model._processor = processor

View File

@@ -149,14 +149,16 @@ class DoubleStreamBlock(nn.Module):
         img_modulated = self.img_norm1(img)
         img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
         img_qkv = self.img_attn.qkv(img_modulated)
-        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        img_qkv = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        img_q, img_k, img_v = torch.unbind(img_qkv, dim=0)
         img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

         # prepare txt for attention
         txt_modulated = self.txt_norm1(txt)
         txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
         txt_qkv = self.txt_attn.qkv(txt_modulated)
-        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        txt_qkv = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        txt_q, txt_k, txt_v = torch.unbind(txt_qkv, dim=0)
         txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)

         # run actual attention
@@ -221,7 +223,8 @@ class SingleStreamBlock(nn.Module):
         x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
         qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)

-        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        qkv = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = torch.unbind(qkv, dim=0)
         q, k = self.norm(q, k, v)

         # compute attention
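
For reference, the qkv unpacking pattern these Flux blocks switch to, as a standalone toy example (the sizes here are arbitrary, not the model's real dimensions):

import torch

batch, seq, heads, dim_head = 2, 16, 4, 8

# Output of a fused qkv projection, i.e. one Linear producing (batch, seq, 3 * heads * dim_head).
qkv = torch.randn(batch, seq, 3 * heads * dim_head)

# Reshape to (3, batch, heads, seq, dim_head), then split along the leading axis with unbind.
qkv = qkv.view(batch, seq, 3, heads, dim_head).permute(2, 0, 3, 1, 4)
q, k, v = torch.unbind(qkv, dim=0)  # each is (batch, heads, seq, dim_head)

assert q.shape == (batch, heads, seq, dim_head)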

View File

@ -1,10 +1,12 @@
import logging
import math import math
from functools import wraps
from typing import Optional
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from torch import nn, einsum
from einops import rearrange, repeat from einops import rearrange, repeat
from typing import Optional from torch import nn, einsum
import logging
from .diffusionmodules.util import AlphaBlender, timestep_embedding from .diffusionmodules.util import AlphaBlender, timestep_embedding
from .sub_quadratic_attention import efficient_dot_product_attention from .sub_quadratic_attention import efficient_dot_product_attention
@ -12,14 +14,22 @@ from ... import model_management
if model_management.xformers_enabled(): if model_management.xformers_enabled():
import xformers # pylint: disable=import-error import xformers # pylint: disable=import-error
import xformers.ops # pylint: disable=import-error import xformers.ops # pylint: disable=import-error
if model_management.sage_attention_enabled():
from sageattention import sageattn
if model_management.flash_attn_enabled():
from flash_attn import flash_attn_func
from ...cli_args import args from ...cli_args import args
from ... import ops from ... import ops
ops = ops.disable_weight_init ops = ops.disable_weight_init
FORCE_UPCAST_ATTENTION_DTYPE = model_management.force_upcast_attention_dtype() FORCE_UPCAST_ATTENTION_DTYPE = model_management.force_upcast_attention_dtype()
def get_attn_precision(attn_precision): def get_attn_precision(attn_precision):
if args.dont_upcast_attention: if args.dont_upcast_attention:
return None return None
@ -27,12 +37,13 @@ def get_attn_precision(attn_precision):
return FORCE_UPCAST_ATTENTION_DTYPE return FORCE_UPCAST_ATTENTION_DTYPE
return attn_precision return attn_precision
def exists(val): def exists(val):
return val is not None return val is not None
def uniq(arr): def uniq(arr):
return{el: True for el in arr}.keys() return {el: True for el in arr}.keys()
def default(val, d): def default(val, d):
@ -82,9 +93,11 @@ class FeedForward(nn.Module):
def forward(self, x): def forward(self, x):
return self.net(x) return self.net(x)
def Normalize(in_channels, dtype=None, device=None): def Normalize(in_channels, dtype=None, device=None):
return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True, dtype=dtype, device=device) return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True, dtype=dtype, device=device)
def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False): def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False):
attn_precision = get_attn_precision(attn_precision) attn_precision = get_attn_precision(attn_precision)
@ -98,7 +111,7 @@ def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
h = heads h = heads
if skip_reshape: if skip_reshape:
q, k, v = map( q, k, v = map(
lambda t: t.reshape(b * heads, -1, dim_head), lambda t: t.reshape(b * heads, -1, dim_head),
(q, k, v), (q, k, v),
) )
@ -122,7 +135,7 @@ def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
if exists(mask): if exists(mask):
if mask.dtype == torch.bool: if mask.dtype == torch.bool:
mask = rearrange(mask, 'b ... -> b (...)') #TODO: check if this bool part matches pytorch attention mask = rearrange(mask, 'b ... -> b (...)') # TODO: check if this bool part matches pytorch attention
max_neg_value = -torch.finfo(sim.dtype).max max_neg_value = -torch.finfo(sim.dtype).max
mask = repeat(mask, 'b j -> (b h) () j', h=h) mask = repeat(mask, 'b j -> (b h) () j', h=h)
sim.masked_fill_(~mask, max_neg_value) sim.masked_fill_(~mask, max_neg_value)
@ -167,13 +180,12 @@ def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None,
value = value.unsqueeze(3).reshape(b, -1, heads, dim_head).permute(0, 2, 1, 3).reshape(b * heads, -1, dim_head) value = value.unsqueeze(3).reshape(b, -1, heads, dim_head).permute(0, 2, 1, 3).reshape(b * heads, -1, dim_head)
key = key.unsqueeze(3).reshape(b, -1, heads, dim_head).permute(0, 2, 3, 1).reshape(b * heads, dim_head, -1) key = key.unsqueeze(3).reshape(b, -1, heads, dim_head).permute(0, 2, 3, 1).reshape(b * heads, dim_head, -1)
dtype = query.dtype dtype = query.dtype
upcast_attention = attn_precision == torch.float32 and query.dtype != torch.float32 upcast_attention = attn_precision == torch.float32 and query.dtype != torch.float32
if upcast_attention: if upcast_attention:
bytes_per_token = torch.finfo(torch.float32).bits//8 bytes_per_token = torch.finfo(torch.float32).bits // 8
else: else:
bytes_per_token = torch.finfo(query.dtype).bits//8 bytes_per_token = torch.finfo(query.dtype).bits // 8
batch_x_heads, q_tokens, _ = query.shape batch_x_heads, q_tokens, _ = query.shape
_, _, k_tokens = key.shape _, _, k_tokens = key.shape
qk_matmul_size_bytes = batch_x_heads * bytes_per_token * q_tokens * k_tokens qk_matmul_size_bytes = batch_x_heads * bytes_per_token * q_tokens * k_tokens
@ -215,9 +227,10 @@ def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None,
hidden_states = hidden_states.to(dtype) hidden_states = hidden_states.to(dtype)
hidden_states = hidden_states.unflatten(0, (-1, heads)).transpose(1,2).flatten(start_dim=2) hidden_states = hidden_states.unflatten(0, (-1, heads)).transpose(1, 2).flatten(start_dim=2)
return hidden_states return hidden_states
def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False): def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False):
attn_precision = get_attn_precision(attn_precision) attn_precision = get_attn_precision(attn_precision)
@ -231,7 +244,7 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
h = heads h = heads
if skip_reshape: if skip_reshape:
q, k, v = map( q, k, v = map(
lambda t: t.reshape(b * heads, -1, dim_head), lambda t: t.reshape(b * heads, -1, dim_head),
(q, k, v), (q, k, v),
) )
@ -262,16 +275,15 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
mem_required = tensor_size * modifier mem_required = tensor_size * modifier
steps = 1 steps = 1
if mem_required > mem_free_total: if mem_required > mem_free_total:
steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2))) steps = 2 ** (math.ceil(math.log(mem_required / mem_free_total, 2)))
# print(f"Expected tensor size:{tensor_size/gb:0.1f}GB, cuda free:{mem_free_cuda/gb:0.1f}GB " # print(f"Expected tensor size:{tensor_size/gb:0.1f}GB, cuda free:{mem_free_cuda/gb:0.1f}GB "
# f"torch free:{mem_free_torch/gb:0.1f} total:{mem_free_total/gb:0.1f} steps:{steps}") # f"torch free:{mem_free_torch/gb:0.1f} total:{mem_free_total/gb:0.1f} steps:{steps}")
if steps > 64: if steps > 64:
max_res = math.floor(math.sqrt(math.sqrt(mem_free_total / 2.5)) / 8) * 64 max_res = math.floor(math.sqrt(math.sqrt(mem_free_total / 2.5)) / 8) * 64
raise RuntimeError(f'Not enough memory, use lower resolution (max approx. {max_res}x{max_res}). ' raise RuntimeError(f'Not enough memory, use lower resolution (max approx. {max_res}x{max_res}). '
f'Need: {mem_required/64/gb:0.1f}GB free, Have:{mem_free_total/gb:0.1f}GB free') f'Need: {mem_required / 64 / gb:0.1f}GB free, Have:{mem_free_total / gb:0.1f}GB free')
if mask is not None: if mask is not None:
if len(mask.shape) == 2: if len(mask.shape) == 2:
@ -289,7 +301,7 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
for i in range(0, q.shape[1], slice_size): for i in range(0, q.shape[1], slice_size):
end = i + slice_size end = i + slice_size
if upcast: if upcast:
with torch.autocast(enabled=False, device_type = 'cuda'): with torch.autocast(enabled=False, device_type='cuda'):
s1 = einsum('b i d, b j d -> b i j', q[:, i:end].float(), k.float()) * scale s1 = einsum('b i d, b j d -> b i j', q[:, i:end].float(), k.float()) * scale
else: else:
s1 = einsum('b i d, b j d -> b i j', q[:, i:end], k) * scale s1 = einsum('b i d, b j d -> b i j', q[:, i:end], k) * scale
@ -331,11 +343,6 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
) )
return r1 return r1
BROKEN_XFORMERS = False
if model_management.xformers_enabled():
x_vers = xformers.__version__
# XFormers bug confirmed on all versions from 0.0.21 to 0.0.26 (q with bs bigger than 65535 gives CUDA error)
BROKEN_XFORMERS = x_vers.startswith("0.0.2") and not x_vers.startswith("0.0.20")
def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False): def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False):
if skip_reshape: if skip_reshape:
@ -346,10 +353,6 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh
disabled_xformers = False disabled_xformers = False
if BROKEN_XFORMERS:
if b * heads > 65535:
disabled_xformers = True
if not disabled_xformers: if not disabled_xformers:
if torch.jit.is_tracing() or torch.jit.is_scripting(): if torch.jit.is_tracing() or torch.jit.is_scripting():
disabled_xformers = True disabled_xformers = True
@ -358,7 +361,7 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh
return attention_pytorch(q, k, v, heads, mask, skip_reshape=skip_reshape) return attention_pytorch(q, k, v, heads, mask, skip_reshape=skip_reshape)
if skip_reshape: if skip_reshape:
q, k, v = map( q, k, v = map(
lambda t: t.reshape(b * heads, -1, dim_head), lambda t: t.reshape(b * heads, -1, dim_head),
(q, k, v), (q, k, v),
) )
@@ -390,22 +393,36 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh
     return out


-def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False):
-    if skip_reshape:
-        b, _, _, dim_head = q.shape
-    else:
-        b, _, dim_head = q.shape
-        dim_head //= heads
-        q, k, v = map(
-            lambda t: t.view(b, -1, heads, dim_head).transpose(1, 2),
-            (q, k, v),
-        )
-
-    out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
-    out = (
-        out.transpose(1, 2).reshape(b, -1, heads * dim_head)
-    )
-    return out
+def pytorch_style_decl(func):
+    @wraps(func)
+    def wrapper(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False):
+        if skip_reshape:
+            b, _, _, dim_head = q.shape
+        else:
+            b, _, dim_head = q.shape
+            dim_head //= heads
+            q, k, v = map(
+                lambda t: t.view(b, -1, heads, dim_head).transpose(1, 2),
+                (q, k, v),
+            )
+
+        out = func(q, k, v, heads, mask=mask, attn_precision=attn_precision, skip_reshape=skip_reshape)
+        out = out.transpose(1, 2).reshape(b, -1, heads * dim_head)
+        return out
+
+    return wrapper
+
+
+@pytorch_style_decl
+def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False):
+    return torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
+
+
+@pytorch_style_decl
+def attention_sagemaker(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False):
+    return sageattn(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
+
+
+@pytorch_style_decl
+def attention_flash_attn(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False):
+    return flash_attn_func(q, k, v)
optimized_attention = attention_basic optimized_attention = attention_basic
@ -426,10 +443,11 @@ else:
optimized_attention_masked = optimized_attention optimized_attention_masked = optimized_attention
def optimized_attention_for_device(device, mask=False, small_input=False): def optimized_attention_for_device(device, mask=False, small_input=False):
if small_input: if small_input:
if model_management.pytorch_attention_enabled(): if model_management.pytorch_attention_enabled():
return attention_pytorch #TODO: need to confirm but this is probably slightly faster for small inputs in all cases return attention_pytorch # TODO: need to confirm but this is probably slightly faster for small inputs in all cases
else: else:
return attention_basic return attention_basic
@ -493,7 +511,7 @@ class BasicTransformerBlock(nn.Module):
self.disable_self_attn = disable_self_attn self.disable_self_attn = disable_self_attn
self.attn1 = CrossAttention(query_dim=inner_dim, heads=n_heads, dim_head=d_head, dropout=dropout, self.attn1 = CrossAttention(query_dim=inner_dim, heads=n_heads, dim_head=d_head, dropout=dropout,
context_dim=context_dim if self.disable_self_attn else None, attn_precision=self.attn_precision, dtype=dtype, device=device, operations=operations) # is a self-attention if not self.disable_self_attn context_dim=context_dim if self.disable_self_attn else None, attn_precision=self.attn_precision, dtype=dtype, device=device, operations=operations) # is a self-attention if not self.disable_self_attn
self.ff = FeedForward(inner_dim, dim_out=dim, dropout=dropout, glu=gated_ff, dtype=dtype, device=device, operations=operations) self.ff = FeedForward(inner_dim, dim_out=dim, dropout=dropout, glu=gated_ff, dtype=dtype, device=device, operations=operations)
if disable_temporal_crossattention: if disable_temporal_crossattention:
@ -507,7 +525,7 @@ class BasicTransformerBlock(nn.Module):
context_dim_attn2 = context_dim context_dim_attn2 = context_dim
self.attn2 = CrossAttention(query_dim=inner_dim, context_dim=context_dim_attn2, self.attn2 = CrossAttention(query_dim=inner_dim, context_dim=context_dim_attn2,
heads=n_heads, dim_head=d_head, dropout=dropout, attn_precision=self.attn_precision, dtype=dtype, device=device, operations=operations) # is self-attn if context is none heads=n_heads, dim_head=d_head, dropout=dropout, attn_precision=self.attn_precision, dtype=dtype, device=device, operations=operations) # is self-attn if context is none
self.norm2 = operations.LayerNorm(inner_dim, dtype=dtype, device=device) self.norm2 = operations.LayerNorm(inner_dim, dtype=dtype, device=device)
self.norm1 = operations.LayerNorm(inner_dim, dtype=dtype, device=device) self.norm1 = operations.LayerNorm(inner_dim, dtype=dtype, device=device)
@ -641,6 +659,7 @@ class SpatialTransformer(nn.Module):
Finally, reshape to image Finally, reshape to image
NEW: use_linear for more efficiency instead of the 1x1 convs NEW: use_linear for more efficiency instead of the 1x1 convs
""" """
def __init__(self, in_channels, n_heads, d_head, def __init__(self, in_channels, n_heads, d_head,
depth=1, dropout=0., context_dim=None, depth=1, dropout=0., context_dim=None,
disable_self_attn=False, use_linear=False, disable_self_attn=False, use_linear=False,
@ -653,23 +672,23 @@ class SpatialTransformer(nn.Module):
self.norm = operations.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True, dtype=dtype, device=device) self.norm = operations.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True, dtype=dtype, device=device)
if not use_linear: if not use_linear:
self.proj_in = operations.Conv2d(in_channels, self.proj_in = operations.Conv2d(in_channels,
inner_dim, inner_dim,
kernel_size=1, kernel_size=1,
stride=1, stride=1,
padding=0, dtype=dtype, device=device) padding=0, dtype=dtype, device=device)
else: else:
self.proj_in = operations.Linear(in_channels, inner_dim, dtype=dtype, device=device) self.proj_in = operations.Linear(in_channels, inner_dim, dtype=dtype, device=device)
self.transformer_blocks = nn.ModuleList( self.transformer_blocks = nn.ModuleList(
[BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d], [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d],
disable_self_attn=disable_self_attn, checkpoint=use_checkpoint, attn_precision=attn_precision, dtype=dtype, device=device, operations=operations) disable_self_attn=disable_self_attn, checkpoint=use_checkpoint, attn_precision=attn_precision, dtype=dtype, device=device, operations=operations)
for d in range(depth)] for d in range(depth)]
) )
if not use_linear: if not use_linear:
self.proj_out = operations.Conv2d(inner_dim,in_channels, self.proj_out = operations.Conv2d(inner_dim, in_channels,
kernel_size=1, kernel_size=1,
stride=1, stride=1,
padding=0, dtype=dtype, device=device) padding=0, dtype=dtype, device=device)
else: else:
self.proj_out = operations.Linear(in_channels, inner_dim, dtype=dtype, device=device) self.proj_out = operations.Linear(in_channels, inner_dim, dtype=dtype, device=device)
self.use_linear = use_linear self.use_linear = use_linear
@ -699,27 +718,27 @@ class SpatialTransformer(nn.Module):
class SpatialVideoTransformer(SpatialTransformer): class SpatialVideoTransformer(SpatialTransformer):
def __init__( def __init__(
self, self,
in_channels, in_channels,
n_heads, n_heads,
d_head, d_head,
depth=1, depth=1,
dropout=0.0, dropout=0.0,
use_linear=False, use_linear=False,
context_dim=None, context_dim=None,
use_spatial_context=False, use_spatial_context=False,
timesteps=None, timesteps=None,
merge_strategy: str = "fixed", merge_strategy: str = "fixed",
merge_factor: float = 0.5, merge_factor: float = 0.5,
time_context_dim=None, time_context_dim=None,
ff_in=False, ff_in=False,
checkpoint=False, checkpoint=False,
time_depth=1, time_depth=1,
disable_self_attn=False, disable_self_attn=False,
disable_temporal_crossattention=False, disable_temporal_crossattention=False,
max_time_embed_period: int = 10000, max_time_embed_period: int = 10000,
attn_precision=None, attn_precision=None,
dtype=None, device=None, operations=ops dtype=None, device=None, operations=ops
): ):
super().__init__( super().__init__(
in_channels, in_channels,
@ -785,13 +804,13 @@ class SpatialVideoTransformer(SpatialTransformer):
) )
def forward( def forward(
self, self,
x: torch.Tensor, x: torch.Tensor,
context: Optional[torch.Tensor] = None, context: Optional[torch.Tensor] = None,
time_context: Optional[torch.Tensor] = None, time_context: Optional[torch.Tensor] = None,
timesteps: Optional[int] = None, timesteps: Optional[int] = None,
image_only_indicator: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None,
transformer_options={} transformer_options={}
) -> torch.Tensor: ) -> torch.Tensor:
_, _, h, w = x.shape _, _, h, w = x.shape
x_in = x x_in = x
@ -801,7 +820,7 @@ class SpatialVideoTransformer(SpatialTransformer):
if self.use_spatial_context: if self.use_spatial_context:
assert ( assert (
context.ndim == 3 context.ndim == 3
), f"n dims of spatial context should be 3 but are {context.ndim}" ), f"n dims of spatial context should be 3 but are {context.ndim}"
if time_context is None: if time_context is None:
@ -830,7 +849,7 @@ class SpatialVideoTransformer(SpatialTransformer):
emb = emb[:, None, :] emb = emb[:, None, :]
for it_, (block, mix_block) in enumerate( for it_, (block, mix_block) in enumerate(
zip(self.transformer_blocks, self.time_stack) zip(self.transformer_blocks, self.time_stack)
): ):
transformer_options["block_index"] = it_ transformer_options["block_index"] = it_
x = block( x = block(
@ -844,7 +863,7 @@ class SpatialVideoTransformer(SpatialTransformer):
B, S, C = x_mix.shape B, S, C = x_mix.shape
x_mix = rearrange(x_mix, "(b t) s c -> (b s) t c", t=timesteps) x_mix = rearrange(x_mix, "(b t) s c -> (b s) t c", t=timesteps)
x_mix = mix_block(x_mix, context=time_context) #TODO: transformer_options x_mix = mix_block(x_mix, context=time_context) # TODO: transformer_options
x_mix = rearrange( x_mix = rearrange(
x_mix, "(b s) t c -> (b t) s c", s=S, b=B // timesteps, c=C, t=timesteps x_mix, "(b s) t c -> (b t) s c", s=S, b=B // timesteps, c=C, t=timesteps
) )
@ -858,5 +877,3 @@ class SpatialVideoTransformer(SpatialTransformer):
x = self.proj_out(x) x = self.proj_out(x)
out = x + x_in out = x + x_in
return out return out
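
The pytorch_style_decl refactor earlier in this file standardizes a (batch, tokens, heads * dim_head) in/out layout around backends that operate on (batch, heads, tokens, dim_head). A self-contained sketch of that same reshaping contract, using PyTorch SDPA as the inner call (toy sizes, independent of this module's imports):

import torch
import torch.nn.functional as F

def sdpa_flat(q, k, v, heads):
    # q, k, v arrive as (batch, tokens, heads * dim_head), the layout the wrapper accepts.
    b, _, inner = q.shape
    dim_head = inner // heads
    q, k, v = (t.view(b, -1, heads, dim_head).transpose(1, 2) for t in (q, k, v))
    out = F.scaled_dot_product_attention(q, k, v)  # (batch, heads, tokens, dim_head)
    return out.transpose(1, 2).reshape(b, -1, heads * dim_head)

x = torch.randn(2, 77, 8 * 64)
print(sdpa_flat(x, x, x, heads=8).shape)  # torch.Size([2, 77, 512])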

View File

@@ -108,7 +108,6 @@ class BaseModel(torch.nn.Module):
             operations = model_config.custom_operations
         self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
         if model_management.force_channels_last():
-            # todo: ???
             self.diffusion_model.to(memory_format=torch.channels_last)
             logging.debug("using channels last mode for diffusion model")
         logging.debug("model weight dtype {}, manual cast: {}".format(self.get_dtype(), self.manual_cast_dtype))

View File

@@ -535,10 +535,6 @@ def get_huggingface_repo_list(*extra_cache_dirs: str) -> List[str]:
             for user_dir in Path(local_dir_root).iterdir():
                 for model_dir in user_dir.iterdir():
-                    try:
-                        _hf_fs.resolve_path(str(user_dir / model_dir))
-                    except Exception as exc_info:
-                        logging.debug(f"HuggingFaceFS did not think this was a valid repo: {user_dir.name}/{model_dir.name} with error {exc_info}", exc_info)
                     existing_local_dir_repos.add(f"{user_dir.name}/{model_dir.name}")

     known_repo_ids = frozenset(KNOWN_HUGGINGFACE_MODEL_REPOS)

View File

@ -23,7 +23,7 @@ import sys
import warnings import warnings
from enum import Enum from enum import Enum
from threading import RLock from threading import RLock
from typing import Literal, List, Sequence from typing import Literal, List, Sequence, Final
import psutil import psutil
import torch import torch
@ -128,6 +128,9 @@ def get_torch_device():
return torch.device("xpu", torch.xpu.current_device()) return torch.device("xpu", torch.xpu.current_device())
else: else:
try: try:
# https://github.com/sayakpaul/diffusers-torchao/blob/bade7a6abb1cab9ef44782e6bcfab76d0237ae1f/inference/benchmark_image.py#L3
# This setting optimizes performance on NVIDIA GPUs with Ampere architecture (e.g., A100, RTX 30 series) or newer.
torch.set_float32_matmul_precision("high")
return torch.device(torch.cuda.current_device()) return torch.device(torch.cuda.current_device())
except: except:
warnings.warn("torch.cuda.current_device() did not return a device, returning a CPU torch device") warnings.warn("torch.cuda.current_device() did not return a device, returning a CPU torch device")
@ -319,7 +322,7 @@ try:
except: except:
logging.warning("Could not pick default device.") logging.warning("Could not pick default device.")
current_loaded_models: List["LoadedModel"] = [] current_loaded_models: Final[List["LoadedModel"]] = []
def module_size(module): def module_size(module):
@ -974,6 +977,22 @@ def cast_to_device(tensor, device, dtype, copy=False):
else: else:
return tensor.to(device, dtype, copy=copy, non_blocking=non_blocking) return tensor.to(device, dtype, copy=copy, non_blocking=non_blocking)
FLASH_ATTENTION_ENABLED = False
if not args.disable_flash_attn:
try:
import flash_attn
FLASH_ATTENTION_ENABLED = True
except ImportError:
pass
SAGE_ATTENTION_ENABLED = False
if not args.disable_sage_attention:
try:
import sageattention
SAGE_ATTENTION_ENABLED = True
except ImportError:
pass
def xformers_enabled(): def xformers_enabled():
global directml_device global directml_device
@ -986,6 +1005,30 @@ def xformers_enabled():
return False return False
return XFORMERS_IS_AVAILABLE return XFORMERS_IS_AVAILABLE
def flash_attn_enabled():
global directml_device
global cpu_state
if cpu_state != CPUState.GPU:
return False
if is_intel_xpu():
return False
if directml_device:
return False
return FLASH_ATTENTION_ENABLED
def sage_attention_enabled():
global directml_device
global cpu_state
if cpu_state != CPUState.GPU:
return False
if is_intel_xpu():
return False
if directml_device:
return False
if xformers_enabled():
return False
return SAGE_ATTENTION_ENABLED
def xformers_enabled_vae(): def xformers_enabled_vae():
enabled = xformers_enabled() enabled = xformers_enabled()

View File

@@ -55,17 +55,13 @@ class ModelManageable(Protocol):
     def model_dtype(self) -> torch.dtype:
         return next(self.model.parameters()).dtype

-    def patch_model_lowvram(self, device_to: torch.device, lowvram_model_memory: int, force_patch_weights: Optional[bool] = False) -> torch.nn.Module:
-        self.patch_model(device_to=device_to, patch_weights=False)
-        return self.model
-
-    def patch_model(self, device_to: torch.device | None = None, patch_weights: bool = True) -> torch.nn.Module:
+    def patch_model(self, device_to: torch.device | None = None, lowvram_model_memory: int = 0, load_weights: bool = True, force_patch_weights: bool = False) -> torch.nn.Module:
         ...

-    def unpatch_model(self, offload_device: torch.device | None = None, unpatch_weights: Optional[bool] = False) -> torch.nn.Module:
+    def unpatch_model(self, device_to: torch.device | None = None, unpatch_weights: Optional[bool] = False) -> torch.nn.Module:
         """
         Unloads the model by moving it to the offload device

-        :param offload_device:
+        :param device_to:
         :param unpatch_weights:
         :return:
         """
@@ -99,6 +95,20 @@ class ModelManageable(Protocol):
     def current_loaded_device(self) -> torch.device:
         return self.current_device

+    def get_model_object(self, name: str) -> torch.nn.Module:
+        from . import utils
+        return utils.get_attr(self.model, name)
+
+    @property
+    def model_options(self) -> dict:
+        if not hasattr(self, "_model_options"):
+            setattr(self, "_model_options", {"transformer_options": {}})
+        return getattr(self, "_model_options")
+
+    @model_options.setter
+    def model_options(self, value):
+        setattr(self, "_model_options", value)
+

 @dataclasses.dataclass
 class MemoryMeasurements:

View File

@ -27,10 +27,11 @@ import torch.nn
from . import model_management, lora from . import model_management, lora
from . import utils from . import utils
from .comfy_types import UnetWrapperFunction
from .float import stochastic_rounding from .float import stochastic_rounding
from .model_base import BaseModel from .model_base import BaseModel
from .model_management_types import ModelManageable, MemoryMeasurements from .model_management_types import ModelManageable, MemoryMeasurements
from .comfy_types import UnetWrapperFunction
def string_to_seed(data): def string_to_seed(data):
crc = 0xFFFFFFFF crc = 0xFFFFFFFF
@ -45,6 +46,7 @@ def string_to_seed(data):
crc >>= 1 crc >>= 1
return crc ^ 0xFFFFFFFF return crc ^ 0xFFFFFFFF
def set_model_options_patch_replace(model_options, patch, name, block_name, number, transformer_index=None): def set_model_options_patch_replace(model_options, patch, name, block_name, number, transformer_index=None):
to = model_options["transformer_options"].copy() to = model_options["transformer_options"].copy()
@ -106,7 +108,7 @@ class ModelPatcher(ModelManageable):
self.backup = {} self.backup = {}
self.object_patches = {} self.object_patches = {}
self.object_patches_backup = {} self.object_patches_backup = {}
self.model_options = {"transformer_options": {}} self._model_options = {"transformer_options": {}}
self.model_size() self.model_size()
self.load_device = load_device self.load_device = load_device
self.offload_device = offload_device self.offload_device = offload_device
@ -115,6 +117,14 @@ class ModelPatcher(ModelManageable):
self.ckpt_name = ckpt_name self.ckpt_name = ckpt_name
self._memory_measurements = MemoryMeasurements(self.model) self._memory_measurements = MemoryMeasurements(self.model)
@property
def model_options(self) -> dict:
return self._model_options
@model_options.setter
def model_options(self, value):
self._model_options = value
@property @property
def model_device(self) -> torch.device: def model_device(self) -> torch.device:
return self._memory_measurements.device return self._memory_measurements.device
@ -145,7 +155,7 @@ class ModelPatcher(ModelManageable):
n.patches_uuid = self.patches_uuid n.patches_uuid = self.patches_uuid
n.object_patches = self.object_patches.copy() n.object_patches = self.object_patches.copy()
n.model_options = copy.deepcopy(self.model_options) n._model_options = copy.deepcopy(self.model_options)
n.backup = self.backup n.backup = self.backup
n.object_patches_backup = self.object_patches_backup n.object_patches_backup = self.object_patches_backup
return n return n
@ -260,6 +270,11 @@ class ModelPatcher(ModelManageable):
self.model_options["model_function_wrapper"] = wrap_func.to(device) self.model_options["model_function_wrapper"] = wrap_func.to(device)
def model_dtype(self): def model_dtype(self):
# this pokes into the internals of diffusion model a little bit
# todo: the base model isn't going to be aware that its diffusion model is patched this way
if isinstance(self.model, BaseModel):
diffusion_model = self.get_model_object("diffusion_model")
return diffusion_model.dtype
if hasattr(self.model, "get_dtype"): if hasattr(self.model, "get_dtype"):
return self.model.get_dtype() return self.model.get_dtype()
@ -293,7 +308,7 @@ class ModelPatcher(ModelManageable):
if filter_prefix is not None: if filter_prefix is not None:
if not k.startswith(filter_prefix): if not k.startswith(filter_prefix):
continue continue
bk = self.backup.get(k, None) bk: torch.nn.Module | None = self.backup.get(k, None)
if bk is not None: if bk is not None:
weight = bk.weight weight = bk.weight
else: else:
@ -494,7 +509,7 @@ class ModelPatcher(ModelManageable):
if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True: if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True:
for key in [weight_key, bias_key]: for key in [weight_key, bias_key]:
bk = self.backup.get(key, None) bk: torch.nn.Module | None = self.backup.get(key, None)
if bk is not None: if bk is not None:
if bk.inplace_update: if bk.inplace_update:
utils.copy_to_param(self.model, key, bk.weight) utils.copy_to_param(self.model, key, bk.weight)

View File

@@ -538,14 +538,16 @@ class DiffusersLoader:
             paths += get_huggingface_repo_list()
         paths = list(frozenset(paths))
-        return {"required": {"model_path": (paths,), }}
+        return {"required": {"model_path": (paths,),
+                             "weight_dtype": (FLUX_WEIGHT_DTYPES,)
+                             }}

     RETURN_TYPES = ("MODEL", "CLIP", "VAE")
     FUNCTION = "load_checkpoint"

     CATEGORY = "advanced/loaders"

-    def load_checkpoint(self, model_path, output_vae=True, output_clip=True):
+    def load_checkpoint(self, model_path, output_vae=True, output_clip=True,weight_dtype:str="default"):
         for search_path in folder_paths.get_folder_paths("diffusers"):
             if os.path.exists(search_path):
                 path = os.path.join(search_path, model_path)
@@ -556,7 +558,8 @@ class DiffusersLoader:
             with comfy_tqdm():
                 model_path = snapshot_download(model_path)

-        return diffusers_load.load_diffusers(model_path, output_vae=output_vae, output_clip=output_clip, embedding_directory=folder_paths.get_folder_paths("embeddings"))
+        model_options = get_model_options_for_dtype(weight_dtype)
+        return diffusers_load.load_diffusers(model_path, output_vae=output_vae, output_clip=output_clip, embedding_directory=folder_paths.get_folder_paths("embeddings"), model_options=model_options)


 class unCLIPCheckpointLoader:
@@ -875,6 +878,14 @@ class ControlNetApplyAdvanced:
             out.append(c)
         return (out[0], out[1])

+
+def get_model_options_for_dtype(weight_dtype):
+    model_options = {}
+    if weight_dtype == "fp8_e4m3fn":
+        model_options["dtype"] = torch.float8_e4m3fn
+    elif weight_dtype == "fp8_e5m2":
+        model_options["dtype"] = torch.float8_e5m2
+    return model_options
+

 class UNETLoader:
     @classmethod
@@ -888,16 +899,14 @@ class UNETLoader:
     CATEGORY = "advanced/loaders"

     def load_unet(self, unet_name, weight_dtype):
-        model_options = {}
-        if weight_dtype == "fp8_e4m3fn":
-            model_options["dtype"] = torch.float8_e4m3fn
-        elif weight_dtype == "fp8_e5m2":
-            model_options["dtype"] = torch.float8_e5m2
+        model_options = get_model_options_for_dtype(weight_dtype)
+
         unet_path = get_or_download("diffusion_models", unet_name, KNOWN_UNET_MODELS)
         model = sd.load_diffusion_model(unet_path, model_options=model_options)
         return (model,)


 class CLIPLoader:
     @classmethod
     def INPUT_TYPES(s):

View File

@ -19,15 +19,19 @@ from __future__ import annotations
import contextlib import contextlib
import itertools import itertools
import json
import logging import logging
import math import math
import os
import random import random
import struct import struct
import sys import sys
import warnings import warnings
from contextlib import contextmanager from contextlib import contextmanager
from pathlib import Path
from typing import Optional, Any from typing import Optional, Any
import accelerate
import numpy as np import numpy as np
import safetensors.torch import safetensors.torch
import torch import torch
@@ -55,13 +59,27 @@ def _get_progress_bar_enabled():

 setattr(sys.modules[__name__], 'PROGRESS_BAR_ENABLED', property(_get_progress_bar_enabled))


-def load_torch_file(ckpt, safe_load=False, device=None):
+def load_torch_file(ckpt: str, safe_load=False, device=None):
     if device is None:
         device = torch.device("cpu")
     if ckpt is None:
         raise FileNotFoundError("the checkpoint was not found")
     if ckpt.lower().endswith(".safetensors") or ckpt.lower().endswith(".sft"):
         sd = safetensors.torch.load_file(ckpt, device=device.type)
+    elif ckpt.lower().endswith("index.json"):
+        # from accelerate
+        index_filename = ckpt
+        checkpoint_folder = os.path.split(index_filename)[0]
+        with open(index_filename) as f:
+            index = json.loads(f.read())
+        if "weight_map" in index:
+            index = index["weight_map"]
+        checkpoint_files = sorted(list(set(index.values())))
+        checkpoint_files = [os.path.join(checkpoint_folder, f) for f in checkpoint_files]
+        sd: dict[str, torch.Tensor] = {}
+        for checkpoint_file in checkpoint_files:
+            sd.update(safetensors.torch.load_file(str(checkpoint_file), device=device.type))
     else:
         if safe_load:
             if not 'weights_only' in torch.load.__code__.co_varnames:
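
For context, the index.json branch added above consumes the standard sharded-safetensors index layout; an illustrative, made-up example of such a file and the shard list the loader derives from it:

# Illustrative contents of a diffusion_pytorch_model.safetensors.index.json (names and sizes are invented).
index = {
    "metadata": {"total_size": 123456789},
    "weight_map": {
        "double_blocks.0.img_attn.qkv.weight": "diffusion_pytorch_model-00001-of-00002.safetensors",
        "final_layer.linear.weight": "diffusion_pytorch_model-00002-of-00002.safetensors",
    },
}

# The loader keeps only the unique shard filenames and merges every shard into one state dict.
shards = sorted(set(index["weight_map"].values()))
print(shards)  # ['diffusion_pytorch_model-00001-of-00002.safetensors', 'diffusion_pytorch_model-00002-of-00002.safetensors']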

View File

@ -1,8 +1,11 @@
import logging import logging
import torch import torch
from torch.nn import LayerNorm
from comfy import model_management
from comfy.model_patcher import ModelPatcher from comfy.model_patcher import ModelPatcher
from comfy.nodes.package_typing import CustomNode, InputTypes
DIFFUSION_MODEL = "diffusion_model" DIFFUSION_MODEL = "diffusion_model"
@ -47,6 +50,65 @@ class TorchCompileModel:
return model, return model,
class QuantizeModel(CustomNode):
@classmethod
def INPUT_TYPES(cls) -> InputTypes:
return {
"required": {
"model": ("MODEL", {}),
"strategy": (["torchao", "quanto"], {"default": "torchao"})
}
}
FUNCTION = "execute"
CATEGORY = "_for_testing"
EXPERIMENTAL = True
RETURN_TYPES = ("MODEL",)
def execute(self, model: ModelPatcher, strategy: str = "torchao"):
logging.warning(f"Quantizing {model} this way quantizes it in place, making it insuitable for cloning. All uses of this model will be quantized.")
logging.warning(f"Quantizing {model} will produce poor results due to Optimum's limitations")
model = model.clone()
unet = model.get_model_object("diffusion_model")
# todo: quantize quantizes in place, which is not desired
# default exclusions
_unused_exclusions = {
"time_embedding.",
"add_embedding.",
"time_in.",
"txt_in.",
"vector_in.",
"img_in.",
"guidance_in.",
"final_layer.",
}
if strategy == "quanto":
from optimum.quanto import quantize, qint8
exclusion_list = [
name for name, module in unet.named_modules() if isinstance(module, LayerNorm) and module.weight is None
]
quantize(unet, weights=qint8, activations=qint8, exclude=exclusion_list)
_in_place_fixme = unet
elif strategy == "torchao":
from torchao.quantization import quantize_, int8_dynamic_activation_int8_weight
model = model.clone()
unet = model.get_model_object("diffusion_model")
# todo: quantize quantizes in place, which is not desired
# def filter_fn(module: torch.nn.Module, name: str):
# return any("weight" in name for name, _ in (module.named_parameters())) and all(exclusion not in name for exclusion in exclusions)
quantize_(unet, int8_dynamic_activation_int8_weight(), device=model_management.get_torch_device())
_in_place_fixme = unet
else:
raise ValueError(f"unknown strategy {strategy}")
model.add_object_patch("diffusion_model", _in_place_fixme)
return model,
NODE_CLASS_MAPPINGS = { NODE_CLASS_MAPPINGS = {
"TorchCompileModel": TorchCompileModel, "TorchCompileModel": TorchCompileModel,
"QuantizeModel": QuantizeModel,
} }
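
A minimal, self-contained illustration of the torchao path used by the node above; quantize_ rewrites the module in place, which is why the node re-attaches the same object with add_object_patch (requires torchao, whose API may vary by version; the toy module is made up):

import torch
from torchao.quantization import quantize_, int8_dynamic_activation_int8_weight

toy = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 8))

# quantize_ mutates `toy` in place rather than returning a new module; afterwards the
# Linear weights are backed by torchao's quantized tensor subclass.
quantize_(toy, int8_dynamic_activation_int8_weight())

print(type(toy[0].weight))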

View File

@@ -88,18 +88,14 @@ class UpscaleModelManageable(ModelManageable):
     def model_dtype(self) -> torch.dtype:
         return next(self.model.parameters()).dtype

-    def patch_model_lowvram(self, device_to: torch.device, lowvram_model_memory: int, force_patch_weights: Optional[bool] = False) -> torch.nn.Module:
-        self.model.to(device=device_to)
-        return self.model
-
-    def patch_model(self, device_to: torch.device | None = None, patch_weights: bool = True) -> torch.nn.Module:
+    def patch_model(self, device_to: torch.device | None = None, lowvram_model_memory: int = 0, load_weights: bool = True, force_patch_weights: bool = False) -> torch.nn.Module:
         self.model.to(device=device_to)
         return self.model

-    def unpatch_model(self, offload_device: torch.device | None = None, unpatch_weights: Optional[bool] = False) -> torch.nn.Module:
-        self.model.to(device=offload_device)
-        return self.model
+    def unpatch_model(self, device_to: torch.device | None = None, unpatch_weights: Optional[bool] = False) -> torch.nn.Module:
+        self.model.to(device=device_to)
+        return self.model

     def __str__(self):
         if self.ckpt_name is not None:
             return f"<UpscaleModelManageable for {self.ckpt_name} ({self.model.__class__.__name__})>"