Merge 7b7becc34c into 1265955b34

ops: handle multi-compute of the same weight (#13705 )
If the same weight is used multiple times within the same prefetch window, it should only apply compute state mutations once. Mark the weight as fully resident on the first pass accordingly.
2026-05-27 01:17:24 +08:00 · 2026-05-04 19:41:16 -04:00 · 2026-05-04 16:40:57 -07:00 · 2026-05-04 12:58:06 -07:00 · 2026-05-04 12:56:05 -07:00 · 2026-05-04 20:20:40 +03:00
7 changed files with 23 additions and 13 deletions
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@ -721,13 +721,15 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
    else:
        minimum_memory_required = max(inference_memory, minimum_memory_required + extra_reserved_memory())

-    models_temp = set()
+    # Order-preserving dedup. A plain set() would randomize iteration order across runs
+    models_temp = {}
    for m in models:
-        models_temp.add(m)
+        models_temp[m] = None
        for mm in m.model_patches_models():
-            models_temp.add(mm)
+            models_temp[mm] = None

-    models = models_temp
+    models = list(models_temp)
+    models.reverse()

    models_to_load = []

--- a/comfy/model_prefetch.py
+++ b/comfy/model_prefetch.py
@ -37,7 +37,8 @@ def prefetch_queue_pop(queue, device, module):
    consumed = queue.pop(0)
    if consumed is not None:
        offload_stream, prefetch_state = consumed
-        offload_stream.wait_stream(comfy.model_management.current_stream(device))
+        if offload_stream is not None:
+            offload_stream.wait_stream(comfy.model_management.current_stream(device))
        _, comfy_modules = prefetch_state
        if comfy_modules is not None:
            cleanup_prefetched_modules(comfy_modules)
--- a/comfy/ops.py
+++ b/comfy/ops.py
@ -253,6 +253,9 @@ def resolve_cast_module_with_vbar(s, dtype, device, bias_dtype, compute_dtype, w
    if bias is not None:
        bias = post_cast(s, "bias", bias, bias_dtype, prefetch["resident"], update_weight)

+    if prefetch["signature"] is not None:
+        prefetch["resident"] = True
+
    return weight, bias


--- a/comfy/sampler_helpers.py
+++ b/comfy/sampler_helpers.py
@ -89,7 +89,8 @@ def get_additional_models(conds, dtype):
        gligen += get_models_from_cond(conds[k], "gligen")
        add_models += get_models_from_cond(conds[k], "additional_models")

-    control_nets = set(cnets)
+    # Order-preserving dedup. A plain set() would randomize iteration order across runs
+    control_nets = list(dict.fromkeys(cnets))

    inference_memory = 0
    control_models = []
--- a/comfy_extras/frame_interpolation_models/film_net.py
+++ b/comfy_extras/frame_interpolation_models/film_net.py
@ -199,6 +199,9 @@ class FILMNet(nn.Module):
    def get_dtype(self):
        return self.extract.extract_sublevels.convs[0][0].conv.weight.dtype

+    def memory_used_forward(self, shape, dtype):
+        return 1700 * shape[1] * shape[2] * dtype.itemsize
+
    def _build_warp_grids(self, H, W, device):
        """Pre-compute warp grids for all pyramid levels."""
        if (H, W) in self._warp_grids:
--- a/comfy_extras/frame_interpolation_models/ifnet.py
+++ b/comfy_extras/frame_interpolation_models/ifnet.py
@ -74,6 +74,9 @@ class IFNet(nn.Module):
    def get_dtype(self):
        return self.encode.cnn0.weight.dtype

+    def memory_used_forward(self, shape, dtype):
+        return 300 * shape[1] * shape[2] * dtype.itemsize
+
    def _build_warp_grids(self, H, W, device):
        if (H, W) in self._warp_grids:
            return
--- a/comfy_extras/nodes_frame_interpolation.py
+++ b/comfy_extras/nodes_frame_interpolation.py
@ -37,7 +37,7 @@ class FrameInterpolationModelLoader(io.ComfyNode):
        model = cls._detect_and_load(sd)
        dtype = torch.float16 if model_management.should_use_fp16(model_management.get_torch_device()) else torch.float32
        model.eval().to(dtype)
-        patcher = comfy.model_patcher.ModelPatcher(
+        patcher = comfy.model_patcher.CoreModelPatcher(
            model,
            load_device=model_management.get_torch_device(),
            offload_device=model_management.unet_offload_device(),
@ -98,16 +98,13 @@ class FrameInterpolate(io.ComfyNode):
        if num_frames < 2 or multiplier < 2:
            return io.NodeOutput(images)

-        model_management.load_model_gpu(interp_model)
        device = interp_model.load_device
        dtype = interp_model.model_dtype()
        inference_model = interp_model.model
-
-        # Free VRAM for inference activations (model weights + ~20x a single frame's worth)
-        H, W = images.shape[1], images.shape[2]
-        activation_mem = H * W * 3 * images.element_size() * 20
-        model_management.free_memory(activation_mem, device)
+        activation_mem = inference_model.memory_used_forward(images.shape, dtype)
+        model_management.load_models_gpu([interp_model], memory_required=activation_mem)
        align = getattr(inference_model, "pad_align", 1)
+        H, W = images.shape[1], images.shape[2]

        # Prepare a single padded frame on device for determining output dimensions
        def prepare_frame(idx):
Author	SHA1	Message	Date
huyua9	bd955f5085	Merge `7b7becc34c` into `1265955b34`	2026-05-04 19:41:16 -04:00
rattus	1265955b34	ops: handle multi-compute of the same weight (#13705 ) If the same weight is used multiple times within the same prefetch window, it should only apply compute state mutations once. Mark the weight as fully resident on the first pass accordingly.	2026-05-04 16:40:57 -07:00
rattus	1ac78180b3	make control-net load order deterministic (#13701 ) Some checks are pending Python Linting / Run Ruff (push) Waiting to run Details Python Linting / Run Pylint (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run Details Execution Tests / test (macos-latest) (push) Waiting to run Details Execution Tests / test (ubuntu-latest) (push) Waiting to run Details Execution Tests / test (windows-latest) (push) Waiting to run Details Test server launches without errors / test (push) Waiting to run Details Unit Tests / test (macos-latest) (push) Waiting to run Details Unit Tests / test (ubuntu-latest) (push) Waiting to run Details Unit Tests / test (windows-2022) (push) Waiting to run Details Make this deterministic so speeds don't change based on load order. Load them in reverse order so whatever the caller lists first is the top priority.	2026-05-04 12:58:06 -07:00
rattus	c47633f3be	prefetch: guard against no offload (#13703 ) cast_ will return no stream if there is no work to do. Guard against this in the consume logic.	2026-05-04 12:56:05 -07:00
Jukka Seppänen	c33d26c283	fix: Proper memory estimation for frame interpolation when not using dynamic VRAM (#13698 )	2026-05-04 20:20:40 +03:00