diff --git a/comfy/model_management.py b/comfy/model_management.py
index b6291f340..6018c1ab6 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -19,7 +19,7 @@ import psutil
 import logging
 from enum import Enum
 
-from comfy.cli_args import args, PerformanceFeature, enables_dynamic_vram
+from comfy.cli_args import args, PerformanceFeature
 import threading
 import torch
 import sys
@@ -651,7 +651,7 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, ram_
         soft_empty_cache()
     return unloaded_models
 
-def load_models_gpu_orig(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
+def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
     cleanup_models_gc()
     global vram_state
 
@@ -747,26 +747,6 @@ def load_models_gpu_orig(models, memory_required=0, force_patch_weights=False, m
         current_loaded_models.insert(0, loaded_model)
     return
 
-def load_models_gpu_thread(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load):
-    with torch.inference_mode():
-        load_models_gpu_orig(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load)
-    soft_empty_cache()
-
-def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
-    #Deliberately load models outside of the Aimdo mempool so they can be retained accross
-    #nodes. Use a dummy thread to do it as pytorch documents that mempool contexts are
-    #thread local. So exploit that to escape context
-    if enables_dynamic_vram():
-        t = threading.Thread(
-            target=load_models_gpu_thread,
-            args=(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load)
-        )
-        t.start()
-        t.join()
-    else:
-        load_models_gpu_orig(models, memory_required=memory_required, force_patch_weights=force_patch_weights,
-                             minimum_memory_required=minimum_memory_required, force_full_load=force_full_load)
-
 def load_model_gpu(model):
     return load_models_gpu([model])
 
@@ -1226,21 +1206,16 @@ def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, str
     if dtype is None:
         dtype = weight._model_dtype
 
-    r = torch.empty_like(weight, dtype=dtype, device=device)
-
     signature = comfy_aimdo.model_vbar.vbar_fault(weight._v)
     if signature is not None:
-        raw_tensor = comfy_aimdo.torch.aimdo_to_tensor(weight._v, device)
-        v_tensor = comfy.memory_management.interpret_gathered_like(cast_geometry, raw_tensor)[0]
+        v_tensor = comfy.memory_management.interpret_gathered_like(cast_geometry, weight._v_tensor)[0]
         if not comfy_aimdo.model_vbar.vbar_signature_compare(signature, weight._v_signature):
             weight._v_signature = signature
             #Send it over
             v_tensor.copy_(weight, non_blocking=non_blocking)
-        #always take a deep copy even if _v is good, as we have no reasonable point to unpin
-        #a non comfy weight
-        r.copy_(v_tensor)
-        comfy_aimdo.model_vbar.vbar_unpin(weight._v)
-        return r
+        return v_tensor.to(dtype=dtype)
+
+    r = torch.empty_like(weight, dtype=dtype, device=device)
 
     if weight.dtype != r.dtype and weight.dtype != weight._model_dtype:
         #Offloaded casting could skip this, however it would make the quantizations
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index d888dbcfb..b9a117a7c 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -1492,7 +1492,9 @@ class ModelPatcherDynamic(ModelPatcher):
         if vbar is not None:
             vbar.prioritize()
 
-        #We have way more tools for acceleration on comfy weight offloading, so always
+        #We force-reserve VRAM for the non-comfy weights so we don't have to deal
+        #with pin and unpin synchronization, which can be expensive for small weights
+        #with a high layer rate (e.g. autoregressive LLMs).
         #prioritize the non-comfy weights (note the order reverse).
         loading = self._load_list(prio_comfy_cast_weights=True)
         loading.sort(reverse=True)
@@ -1541,6 +1543,7 @@ class ModelPatcherDynamic(ModelPatcher):
 
                 if vbar is not None and not hasattr(m, "_v"):
                     m._v = vbar.alloc(v_weight_size)
+                    m._v_tensor = comfy_aimdo.torch.aimdo_to_tensor(m._v, device_to)
                     allocated_size += v_weight_size
 
             else:
@@ -1555,8 +1558,10 @@ class ModelPatcherDynamic(ModelPatcher):
 
                 weight_size = geometry.numel() * geometry.element_size()
                 if vbar is not None and not hasattr(weight, "_v"):
                     weight._v = vbar.alloc(weight_size)
+                    weight._v_tensor = comfy_aimdo.torch.aimdo_to_tensor(weight._v, device_to)
                     weight._model_dtype = model_dtype
                     allocated_size += weight_size
 
+        vbar.set_watermark_limit(allocated_size)
         logging.info(f"Model {self.model.__class__.__name__} prepared for dynamic VRAM loading. {allocated_size // (1024 ** 2)}MB Staged. {num_patches} patches attached.")
diff --git a/comfy/ops.py b/comfy/ops.py
index 0f4eca7c7..ea0d70702 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -87,7 +87,7 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu
 
     signature = comfy_aimdo.model_vbar.vbar_fault(s._v)
     if signature is not None:
-        xfer_dest = comfy_aimdo.torch.aimdo_to_tensor(s._v, device)
+        xfer_dest = s._v_tensor
         resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature)
 
         if not resident:
diff --git a/execution.py b/execution.py
index 3dbab82e6..896862c6b 100644
--- a/execution.py
+++ b/execution.py
@@ -13,8 +13,11 @@ from contextlib import nullcontext
 
 import torch
 
+from comfy.cli_args import args
 import comfy.memory_management
 import comfy.model_management
+import comfy_aimdo.model_vbar
+
 from latent_preview import set_preview_method
 import nodes
 from comfy_execution.caching import (
@@ -527,8 +530,10 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
             output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, v3_data=v3_data)
         finally:
             if allocator is not None:
+                if args.verbose == "DEBUG":
+                    comfy_aimdo.model_vbar.vbars_analyze()
                 comfy.model_management.reset_cast_buffers()
-                torch.cuda.synchronize()
+                comfy_aimdo.model_vbar.vbars_reset_watermark_limits()
 
         if has_pending_tasks:
             pending_async_nodes[unique_id] = output_data
diff --git a/requirements.txt b/requirements.txt
index 5e34a2a49..4fda07fde 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,7 +22,7 @@ alembic
 SQLAlchemy
 av>=14.2.0
 comfy-kitchen>=0.2.7
-comfy-aimdo>=0.1.7
+comfy-aimdo>=0.1.8
 requests
 
 #non essential dependencies: