Custom guider for correct offloading behavior

Kohaku-Blueleaf 2025-12-05 17:24:55 +08:00
parent bf573e94a2
commit 4004af3290
2 changed files with 97 additions and 19 deletions

comfy/sampler_helpers.py

@@ -122,20 +122,21 @@ def estimate_memory(model, noise_shape, conds):
     minimum_memory_required = model.model.memory_required([noise_shape[0]] + list(noise_shape[1:]), cond_shapes=cond_shapes_min)
     return memory_required, minimum_memory_required

-def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None):
+def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, skip_load_model=False):
     executor = comfy.patcher_extension.WrapperExecutor.new_executor(
         _prepare_sampling,
         comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.PREPARE_SAMPLING, model_options, is_model_options=True)
     )
-    return executor.execute(model, noise_shape, conds, model_options=model_options)
+    return executor.execute(model, noise_shape, conds, model_options=model_options, skip_load_model=skip_load_model)

-def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None):
+def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, skip_load_model=False):
     real_model: BaseModel = None
     models, inference_memory = get_additional_models(conds, model.model_dtype())
     models += get_additional_models_from_model_options(model_options)
     models += model.get_nested_additional_models() # TODO: does this require inference_memory update?
     memory_required, minimum_memory_required = estimate_memory(model, noise_shape, conds)
-    comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required + inference_memory, minimum_memory_required=minimum_memory_required + inference_memory)
+    models_list = [model] if not skip_load_model else []
+    comfy.model_management.load_models_gpu(models_list + models, memory_required=memory_required + inference_memory, minimum_memory_required=minimum_memory_required + inference_memory)
     real_model = model.model
     return real_model, conds, models
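
The new skip_load_model flag lets a caller keep ownership of main-model placement: when True, _prepare_sampling loads only the auxiliary models gathered from the conds and model options, and assumes the patched model is already where it needs to be. A minimal sketch of the intended call pattern (assuming a ModelPatcher mp, a noise tensor, and prepared conds, mirroring how TrainLoraNode.execute() uses it below):

import comfy.model_management
import comfy.sampler_helpers

# The caller loads the main model itself, fully on-device.
comfy.model_management.load_models_gpu([mp], memory_required=1e20, force_full_load=True)
# prepare_sampling then only loads the additional models (controlnets etc.).
real_model, conds, extra_models = comfy.sampler_helpers.prepare_sampling(
    mp, noise.shape, conds, skip_load_model=True
)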

comfy_extras/nodes_train.py

@@ -10,6 +10,7 @@ from PIL import Image, ImageDraw, ImageFont
 from typing_extensions import override

 import comfy.samplers
+import comfy.sampler_helpers
 import comfy.sd
 import comfy.utils
 import comfy.model_management
@@ -21,6 +22,68 @@ from comfy_api.latest import ComfyExtension, io, ui
 from comfy.utils import ProgressBar

+
+class TrainGuider(comfy_extras.nodes_custom_sampler.Guider_Basic):
+    """
+    CFGGuider with modifications for training specific logic
+    """
+
+    def outer_sample(
+        self,
+        noise,
+        latent_image,
+        sampler,
+        sigmas,
+        denoise_mask=None,
+        callback=None,
+        disable_pbar=False,
+        seed=None,
+        latent_shapes=None,
+    ):
+        self.inner_model, self.conds, self.loaded_models = (
+            comfy.sampler_helpers.prepare_sampling(
+                self.model_patcher,
+                noise.shape,
+                self.conds,
+                self.model_options,
+                skip_load_model=True,  # skip load model as we manage it in TrainLoraNode.execute()
+            )
+        )
+        device = self.model_patcher.load_device
+
+        if denoise_mask is not None:
+            denoise_mask = comfy.sampler_helpers.prepare_mask(
+                denoise_mask, noise.shape, device
+            )
+
+        noise = noise.to(device)
+        latent_image = latent_image.to(device)
+        sigmas = sigmas.to(device)
+        comfy.samplers.cast_to_load_options(
+            self.model_options, device=device, dtype=self.model_patcher.model_dtype()
+        )
+
+        try:
+            self.model_patcher.pre_run()
+            output = self.inner_sample(
+                noise,
+                latent_image,
+                device,
+                sampler,
+                sigmas,
+                denoise_mask,
+                callback,
+                disable_pbar,
+                seed,
+                latent_shapes=latent_shapes,
+            )
+        finally:
+            self.model_patcher.cleanup()
+            comfy.sampler_helpers.cleanup_models(self.conds, self.loaded_models)
+            del self.inner_model
+            del self.loaded_models
+        return output
+
+
 def make_batch_extra_option_dict(d, indicies, full_size=None):
     new_dict = {}
     for k, v in d.items():
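
TrainGuider reuses the overall shape of CFGGuider.outer_sample but passes skip_load_model=True, because TrainLoraNode.execute() has already loaded the model with force_full_load=True before sampling starts. A rough usage sketch (assuming mp, positive, train_sampler, sigmas, noise, latent_image, and seed are prepared the way the node below prepares them):

guider = TrainGuider(mp)
guider.set_conds(positive)
# sample() reaches outer_sample() internally without triggering a second model load
output = guider.sample(noise, latent_image, train_sampler, sigmas, seed=seed)
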
@@ -77,7 +140,9 @@ class TrainSampler(comfy.samplers.Sampler):
         self.training_dtype = training_dtype
         self.real_dataset: list[torch.Tensor] | None = real_dataset
         # Bucket mode data
-        self.bucket_latents: list[torch.Tensor] | None = bucket_latents # list of (Bi, C, Hi, Wi)
+        self.bucket_latents: list[torch.Tensor] | None = (
+            bucket_latents  # list of (Bi, C, Hi, Wi)
+        )
         # Precompute bucket offsets and weights for sampling
         if bucket_latents is not None:
             self._init_bucket_data(bucket_latents)
@ -511,7 +576,9 @@ def _load_existing_lora(existing_lora):
return existing_weights, existing_steps return existing_weights, existing_steps
def _create_weight_adapter(module, module_name, existing_weights, algorithm, lora_dtype, rank): def _create_weight_adapter(
module, module_name, existing_weights, algorithm, lora_dtype, rank
):
"""Create a weight adapter for a module with weight. """Create a weight adapter for a module with weight.
Args: Args:
@@ -663,7 +730,9 @@ def _create_loss_function(loss_function_name):
     return torch.nn.SmoothL1Loss()

-def _run_training_loop(guider, train_sampler, latents, num_images, seed, bucket_mode, multi_res):
+def _run_training_loop(
+    guider, train_sampler, latents, num_images, seed, bucket_mode, multi_res
+):
     """Execute the training loop.

     Args:
@@ -815,11 +884,6 @@ class TrainLoraNode(io.ComfyNode):
                     default=False,
                     tooltip="Enable resolution bucket mode. When enabled, expects pre-bucketed latents from ResolutionBucket node.",
                 ),
-                io.Boolean.Input(
-                    "offloading",
-                    default=False,
-                    tooltip="",
-                ),
             ],
             outputs=[
                 io.Model.Output(
@@ -855,7 +919,6 @@ class TrainLoraNode(io.ComfyNode):
         gradient_checkpointing,
         existing_lora,
         bucket_mode,
-        offloading,
     ):
         # Extract scalars from lists (due to is_input_list=True)
         model = model[0]
@@ -890,7 +953,9 @@ class TrainLoraNode(io.ComfyNode):
         mp.set_model_compute_dtype(dtype)

         # Prepare latents and compute counts
-        latents, num_images, multi_res = _prepare_latents_and_count(latents, dtype, bucket_mode)
+        latents, num_images, multi_res = _prepare_latents_and_count(
+            latents, dtype, bucket_mode
+        )

         # Validate and expand conditioning
         positive = _validate_and_expand_conditioning(positive, num_images, bucket_mode)
@ -905,7 +970,9 @@ class TrainLoraNode(io.ComfyNode):
) )
# Create optimizer and loss function # Create optimizer and loss function
optimizer = _create_optimizer(optimizer_name, lora_sd.values(), learning_rate) optimizer = _create_optimizer(
optimizer_name, lora_sd.values(), learning_rate
)
criterion = _create_loss_function(loss_function_name) criterion = _create_loss_function(loss_function_name)
# Setup gradient checkpointing # Setup gradient checkpointing
@@ -918,8 +985,10 @@ class TrainLoraNode(io.ComfyNode):
         # Setup models for training
         mp.model.requires_grad_(False)
         torch.cuda.empty_cache()
+        # With force_full_load=False we should be able to have offloading
+        # But for offloading in training we need custom AutoGrad hooks for fwd/bwd
         comfy.model_management.load_models_gpu(
-            [mp], memory_required=1e20, force_full_load=not offloading
+            [mp], memory_required=1e20, force_full_load=True
         )
         torch.cuda.empty_cache()
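
The comment added above explains why the offloading toggle was dropped: with force_full_load=False, parts of the model would live off-GPU, but the tensors autograd saves during the forward pass still have to be resident during backward, so training-time offloading needs hook machinery around the forward/backward passes. As a rough illustration of that kind of machinery (not part of this commit), PyTorch's saved-tensor hooks can stage backward-pass tensors on the CPU:

import torch

model = torch.nn.Linear(1024, 1024).cuda()
x = torch.randn(8, 1024, device="cuda", requires_grad=True)

# save_on_cpu() moves every tensor saved for backward to host memory and
# copies it back to the GPU on demand as backward consumes it.
with torch.autograd.graph.save_on_cpu(pin_memory=True):
    loss = model(x).sum()
loss.backward()
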
@ -956,12 +1025,20 @@ class TrainLoraNode(io.ComfyNode):
) )
# Setup guider # Setup guider
guider = comfy_extras.nodes_custom_sampler.Guider_Basic(mp) guider = TrainGuider(mp)
guider.set_conds(positive) guider.set_conds(positive)
# Run training loop # Run training loop
try: try:
_run_training_loop(guider, train_sampler, latents, num_images, seed, bucket_mode, multi_res) _run_training_loop(
guider,
train_sampler,
latents,
num_images,
seed,
bucket_mode,
multi_res,
)
finally: finally:
for m in mp.model.modules(): for m in mp.model.modules():
unpatch(m) unpatch(m)
@@ -977,7 +1054,7 @@ class TrainLoraNode(io.ComfyNode):
         return io.NodeOutput(mp, lora_sd, loss_map, steps + existing_steps)

-class LoraModelLoader(io.ComfyNode):
+class LoraModelLoader(io.ComfyNode):#
     @classmethod
     def define_schema(cls):
         return io.Schema(