From fe053ba5eb34c8abcc5d17a25c114340af1833aa Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Tue, 10 Feb 2026 10:37:17 -0800
Subject: [PATCH 01/25] mp: dont deep-clone objects from model_options (#12382)

If there are non-trivial python objects nested in the model_options, this
causes all sorts of issues. Traverse lists and dicts so clones can safely
overide settings and BYO objects but stop there on the deepclone.
---
 comfy/model_patcher.py |  3 +--
 comfy/utils.py         | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index b9a117a7c..19c9031ea 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -19,7 +19,6 @@
 from __future__ import annotations
 
 import collections
-import copy
 import inspect
 import logging
 import math
@@ -317,7 +316,7 @@ class ModelPatcher:
 
         n.object_patches = self.object_patches.copy()
         n.weight_wrapper_patches = self.weight_wrapper_patches.copy()
-        n.model_options = copy.deepcopy(self.model_options)
+        n.model_options = comfy.utils.deepcopy_list_dict(self.model_options)
         n.backup = self.backup
         n.object_patches_backup = self.object_patches_backup
         n.parent = self
diff --git a/comfy/utils.py b/comfy/utils.py
index 1337e2205..edd80cebe 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -1376,3 +1376,21 @@ def string_to_seed(data):
             else:
                 crc >>= 1
     return crc ^ 0xFFFFFFFF
+
+def deepcopy_list_dict(obj, memo=None):
+    if memo is None:
+        memo = {}
+
+    obj_id = id(obj)
+    if obj_id in memo:
+        return memo[obj_id]
+
+    if isinstance(obj, dict):
+        res = {deepcopy_list_dict(k, memo): deepcopy_list_dict(v, memo) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        res = [deepcopy_list_dict(i, memo) for i in obj]
+    else:
+        res = obj
+
+    memo[obj_id] = res
+    return res

From f719f9c06266e7944683009b403e995d4c61d5f0 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Tue, 10 Feb 2026 10:37:46 -0800
Subject: [PATCH 02/25] sd: delay VAE dtype archive until after override
 (#12388)

VAEs have host specific dtype logic that should override the dynamic
_model_dtype. Defer the archiving of model dtypes until after.
---
 comfy/sd.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/comfy/sd.py b/comfy/sd.py
index bc9407405..f65e7cadd 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -793,8 +793,6 @@ class VAE:
             self.first_stage_model = AutoencoderKL(**(config['params']))
         self.first_stage_model = self.first_stage_model.eval()
 
-        model_management.archive_model_dtypes(self.first_stage_model)
-
         if device is None:
             device = model_management.vae_device()
         self.device = device
@@ -803,6 +801,7 @@ class VAE:
             dtype = model_management.vae_dtype(self.device, self.working_dtypes)
         self.vae_dtype = dtype
         self.first_stage_model.to(self.vae_dtype)
+        model_management.archive_model_dtypes(self.first_stage_model)
         self.output_device = model_management.intermediate_device()
 
         mp = comfy.model_patcher.CoreModelPatcher

From 123a7874a97c4a8b8f06d4b7c2b1a566b8f0d057 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Tue, 10 Feb 2026 10:38:28 -0800
Subject: [PATCH 03/25] ops: Fix vanilla-fp8 loaded lora quality (#12390)

This was missing the stochastic rounding required for fp8 downcast
to be consistent with model_patcher.patch_weight_to_device.

Missed in testing as I spend too much time with quantized tensors
and overlooked the simpler ones.
---
 comfy/ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/comfy/ops.py b/comfy/ops.py
index ea0d70702..33803b223 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -169,8 +169,8 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu
                 if orig.dtype == dtype and len(fns) == 0:
                     #The layer actually wants our freshly saved QT
                     x = y
-            else:
-                y = x
+            elif update_weight:
+                y = comfy.float.stochastic_rounding(x, orig.dtype, seed = comfy.utils.string_to_seed(s.seed_key))
             if update_weight:
                 orig.copy_(y)
         for f in fns:

From 00fff6019ecf0f4306005579e93cef0cd51a3a1c Mon Sep 17 00:00:00 2001
From: guill <jacob.e.segal@gmail.com>
Date: Tue, 10 Feb 2026 14:37:14 -0800
Subject: [PATCH 04/25] feat(jobs): add 3d to PREVIEWABLE_MEDIA_TYPES for
 first-class 3D output support (#12381)

Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
---
 comfy_execution/jobs.py      |  79 +++++++++++--
 tests/execution/test_jobs.py | 208 ++++++++++++++++++++++++++++++++++-
 2 files changed, 271 insertions(+), 16 deletions(-)

diff --git a/comfy_execution/jobs.py b/comfy_execution/jobs.py
index bf091a448..370014fb6 100644
--- a/comfy_execution/jobs.py
+++ b/comfy_execution/jobs.py
@@ -20,10 +20,60 @@ class JobStatus:
 
 
 # Media types that can be previewed in the frontend
-PREVIEWABLE_MEDIA_TYPES = frozenset({'images', 'video', 'audio'})
+PREVIEWABLE_MEDIA_TYPES = frozenset({'images', 'video', 'audio', '3d'})
 
 # 3D file extensions for preview fallback (no dedicated media_type exists)
-THREE_D_EXTENSIONS = frozenset({'.obj', '.fbx', '.gltf', '.glb'})
+THREE_D_EXTENSIONS = frozenset({'.obj', '.fbx', '.gltf', '.glb', '.usdz'})
+
+
+def has_3d_extension(filename: str) -> bool:
+    lower = filename.lower()
+    return any(lower.endswith(ext) for ext in THREE_D_EXTENSIONS)
+
+
+def normalize_output_item(item):
+    """Normalize a single output list item for the jobs API.
+
+    Returns the normalized item, or None to exclude it.
+    String items with 3D extensions become {filename, type, subfolder} dicts.
+    """
+    if item is None:
+        return None
+    if isinstance(item, str):
+        if has_3d_extension(item):
+            return {'filename': item, 'type': 'output', 'subfolder': '', 'mediaType': '3d'}
+        return None
+    if isinstance(item, dict):
+        return item
+    return None
+
+
+def normalize_outputs(outputs: dict) -> dict:
+    """Normalize raw node outputs for the jobs API.
+
+    Transforms string 3D filenames into file output dicts and removes
+    None items. All other items (non-3D strings, dicts, etc.) are
+    preserved as-is.
+    """
+    normalized = {}
+    for node_id, node_outputs in outputs.items():
+        if not isinstance(node_outputs, dict):
+            normalized[node_id] = node_outputs
+            continue
+        normalized_node = {}
+        for media_type, items in node_outputs.items():
+            if media_type == 'animated' or not isinstance(items, list):
+                normalized_node[media_type] = items
+                continue
+            normalized_items = []
+            for item in items:
+                if item is None:
+                    continue
+                norm = normalize_output_item(item)
+                normalized_items.append(norm if norm is not None else item)
+            normalized_node[media_type] = normalized_items
+        normalized[node_id] = normalized_node
+    return normalized
 
 
 def _extract_job_metadata(extra_data: dict) -> tuple[Optional[int], Optional[str]]:
@@ -45,9 +95,9 @@ def is_previewable(media_type: str, item: dict) -> bool:
     Maintains backwards compatibility with existing logic.
 
     Priority:
-    1. media_type is 'images', 'video', or 'audio'
+    1. media_type is 'images', 'video', 'audio', or '3d'
     2. format field starts with 'video/' or 'audio/'
-    3. filename has a 3D extension (.obj, .fbx, .gltf, .glb)
+    3. filename has a 3D extension (.obj, .fbx, .gltf, .glb, .usdz)
     """
     if media_type in PREVIEWABLE_MEDIA_TYPES:
         return True
@@ -139,7 +189,7 @@ def normalize_history_item(prompt_id: str, history_item: dict, include_outputs:
     })
 
     if include_outputs:
-        job['outputs'] = outputs
+        job['outputs'] = normalize_outputs(outputs)
         job['execution_status'] = status_info
         job['workflow'] = {
             'prompt': prompt,
@@ -171,18 +221,23 @@ def get_outputs_summary(outputs: dict) -> tuple[int, Optional[dict]]:
                 continue
 
             for item in items:
-                count += 1
-
-                if not isinstance(item, dict):
+                normalized = normalize_output_item(item)
+                if normalized is None:
                     continue
 
-                if preview_output is None and is_previewable(media_type, item):
+                count += 1
+
+                if preview_output is not None:
+                    continue
+
+                if isinstance(normalized, dict) and is_previewable(media_type, normalized):
                     enriched = {
-                        **item,
+                        **normalized,
                         'nodeId': node_id,
-                        'mediaType': media_type
                     }
-                    if item.get('type') == 'output':
+                    if 'mediaType' not in normalized:
+                        enriched['mediaType'] = media_type
+                    if normalized.get('type') == 'output':
                         preview_output = enriched
                     elif fallback_preview is None:
                         fallback_preview = enriched
diff --git a/tests/execution/test_jobs.py b/tests/execution/test_jobs.py
index 4d2f9ed36..83c36fe48 100644
--- a/tests/execution/test_jobs.py
+++ b/tests/execution/test_jobs.py
@@ -5,8 +5,11 @@ from comfy_execution.jobs import (
     is_previewable,
     normalize_queue_item,
     normalize_history_item,
+    normalize_output_item,
+    normalize_outputs,
     get_outputs_summary,
     apply_sorting,
+    has_3d_extension,
 )
 
 
@@ -35,8 +38,8 @@ class TestIsPreviewable:
     """Unit tests for is_previewable()"""
 
     def test_previewable_media_types(self):
-        """Images, video, audio media types should be previewable."""
-        for media_type in ['images', 'video', 'audio']:
+        """Images, video, audio, 3d media types should be previewable."""
+        for media_type in ['images', 'video', 'audio', '3d']:
             assert is_previewable(media_type, {}) is True
 
     def test_non_previewable_media_types(self):
@@ -46,7 +49,7 @@ class TestIsPreviewable:
 
     def test_3d_extensions_previewable(self):
         """3D file extensions should be previewable regardless of media_type."""
-        for ext in ['.obj', '.fbx', '.gltf', '.glb']:
+        for ext in ['.obj', '.fbx', '.gltf', '.glb', '.usdz']:
             item = {'filename': f'model{ext}'}
             assert is_previewable('files', item) is True
 
@@ -160,7 +163,7 @@ class TestGetOutputsSummary:
 
     def test_3d_files_previewable(self):
         """3D file extensions should be previewable."""
-        for ext in ['.obj', '.fbx', '.gltf', '.glb']:
+        for ext in ['.obj', '.fbx', '.gltf', '.glb', '.usdz']:
             outputs = {
                 'node1': {
                     'files': [{'filename': f'model{ext}', 'type': 'output'}]
@@ -192,6 +195,64 @@ class TestGetOutputsSummary:
         assert preview['mediaType'] == 'images'
         assert preview['subfolder'] == 'outputs'
 
+    def test_string_3d_filename_creates_preview(self):
+        """String items with 3D extensions should synthesize a preview (Preview3D node output).
+        Only the .glb counts — nulls and non-file strings are excluded."""
+        outputs = {
+            'node1': {
+                'result': ['preview3d_abc123.glb', None, None]
+            }
+        }
+        count, preview = get_outputs_summary(outputs)
+        assert count == 1
+        assert preview is not None
+        assert preview['filename'] == 'preview3d_abc123.glb'
+        assert preview['mediaType'] == '3d'
+        assert preview['nodeId'] == 'node1'
+        assert preview['type'] == 'output'
+
+    def test_string_non_3d_filename_no_preview(self):
+        """String items without 3D extensions should not create a preview."""
+        outputs = {
+            'node1': {
+                'result': ['data.json', None]
+            }
+        }
+        count, preview = get_outputs_summary(outputs)
+        assert count == 0
+        assert preview is None
+
+    def test_string_3d_filename_used_as_fallback(self):
+        """String 3D preview should be used when no dict items are previewable."""
+        outputs = {
+            'node1': {
+                'latents': [{'filename': 'latent.safetensors'}],
+            },
+            'node2': {
+                'result': ['model.glb', None]
+            }
+        }
+        count, preview = get_outputs_summary(outputs)
+        assert preview is not None
+        assert preview['filename'] == 'model.glb'
+        assert preview['mediaType'] == '3d'
+
+
+class TestHas3DExtension:
+    """Unit tests for has_3d_extension()"""
+
+    def test_recognized_extensions(self):
+        for ext in ['.obj', '.fbx', '.gltf', '.glb', '.usdz']:
+            assert has_3d_extension(f'model{ext}') is True
+
+    def test_case_insensitive(self):
+        assert has_3d_extension('MODEL.GLB') is True
+        assert has_3d_extension('Scene.GLTF') is True
+
+    def test_non_3d_extensions(self):
+        for name in ['photo.png', 'video.mp4', 'data.json', 'model']:
+            assert has_3d_extension(name) is False
+
 
 class TestApplySorting:
     """Unit tests for apply_sorting()"""
@@ -395,3 +456,142 @@ class TestNormalizeHistoryItem:
             'prompt': {'nodes': {'1': {}}},
             'extra_data': {'create_time': 1234567890, 'client_id': 'abc'},
         }
+
+    def test_include_outputs_normalizes_3d_strings(self):
+        """Detail view should transform string 3D filenames into file output dicts."""
+        history_item = {
+            'prompt': (
+                5,
+                'prompt-3d',
+                {'nodes': {}},
+                {'create_time': 1234567890},
+                ['node1'],
+            ),
+            'status': {'status_str': 'success', 'completed': True, 'messages': []},
+            'outputs': {
+                'node1': {
+                    'result': ['preview3d_abc123.glb', None, None]
+                }
+            },
+        }
+        job = normalize_history_item('prompt-3d', history_item, include_outputs=True)
+
+        assert job['outputs_count'] == 1
+        result_items = job['outputs']['node1']['result']
+        assert len(result_items) == 1
+        assert result_items[0] == {
+            'filename': 'preview3d_abc123.glb',
+            'type': 'output',
+            'subfolder': '',
+            'mediaType': '3d',
+        }
+
+    def test_include_outputs_preserves_dict_items(self):
+        """Detail view normalization should pass dict items through unchanged."""
+        history_item = {
+            'prompt': (
+                5,
+                'prompt-img',
+                {'nodes': {}},
+                {'create_time': 1234567890},
+                ['node1'],
+            ),
+            'status': {'status_str': 'success', 'completed': True, 'messages': []},
+            'outputs': {
+                'node1': {
+                    'images': [
+                        {'filename': 'photo.png', 'type': 'output', 'subfolder': ''},
+                    ]
+                }
+            },
+        }
+        job = normalize_history_item('prompt-img', history_item, include_outputs=True)
+
+        assert job['outputs_count'] == 1
+        assert job['outputs']['node1']['images'] == [
+            {'filename': 'photo.png', 'type': 'output', 'subfolder': ''},
+        ]
+
+
+class TestNormalizeOutputItem:
+    """Unit tests for normalize_output_item()"""
+
+    def test_none_returns_none(self):
+        assert normalize_output_item(None) is None
+
+    def test_string_3d_extension_synthesizes_dict(self):
+        result = normalize_output_item('model.glb')
+        assert result == {'filename': 'model.glb', 'type': 'output', 'subfolder': '', 'mediaType': '3d'}
+
+    def test_string_non_3d_extension_returns_none(self):
+        assert normalize_output_item('data.json') is None
+
+    def test_string_no_extension_returns_none(self):
+        assert normalize_output_item('camera_info_string') is None
+
+    def test_dict_passes_through(self):
+        item = {'filename': 'test.png', 'type': 'output'}
+        assert normalize_output_item(item) is item
+
+    def test_other_types_return_none(self):
+        assert normalize_output_item(42) is None
+        assert normalize_output_item(True) is None
+
+
+class TestNormalizeOutputs:
+    """Unit tests for normalize_outputs()"""
+
+    def test_empty_outputs(self):
+        assert normalize_outputs({}) == {}
+
+    def test_dict_items_pass_through(self):
+        outputs = {
+            'node1': {
+                'images': [{'filename': 'a.png', 'type': 'output'}],
+            }
+        }
+        result = normalize_outputs(outputs)
+        assert result == outputs
+
+    def test_3d_string_synthesized(self):
+        outputs = {
+            'node1': {
+                'result': ['model.glb', None, None],
+            }
+        }
+        result = normalize_outputs(outputs)
+        assert result == {
+            'node1': {
+                'result': [
+                    {'filename': 'model.glb', 'type': 'output', 'subfolder': '', 'mediaType': '3d'},
+                ],
+            }
+        }
+
+    def test_animated_key_preserved(self):
+        outputs = {
+            'node1': {
+                'images': [{'filename': 'a.png', 'type': 'output'}],
+                'animated': [True],
+            }
+        }
+        result = normalize_outputs(outputs)
+        assert result['node1']['animated'] == [True]
+
+    def test_non_dict_node_outputs_preserved(self):
+        outputs = {'node1': 'unexpected_value'}
+        result = normalize_outputs(outputs)
+        assert result == {'node1': 'unexpected_value'}
+
+    def test_none_items_filtered_but_other_types_preserved(self):
+        outputs = {
+            'node1': {
+                'result': ['data.json', None, [1, 2, 3]],
+            }
+        }
+        result = normalize_outputs(outputs)
+        assert result == {
+            'node1': {
+                'result': ['data.json', [1, 2, 3]],
+            }
+        }

From dbe70b6821994ce92d9cf211cc685862d0b6c0ca Mon Sep 17 00:00:00 2001
From: AustinMroz <austin@comfy.org>
Date: Tue, 10 Feb 2026 14:42:21 -0800
Subject: [PATCH 05/25] Add a VideoSlice node (#12107)

* Base TrimVideo implementation

* Raise error if as_trimmed call fails

* Bigger max start_time, tooltips, and formatting

* Count packets unless codec has subframes

* Remove incorrect nested decode

* Add null check for audio streams

* Support non-strict duration

* Added strict_duration bool to node definition

* Empty commit for approval

* Fix duration

* Support 5.1 audio layout on save

---------

Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
---
 comfy_api/latest/_input/video_types.py      |  15 ++
 comfy_api/latest/_input_impl/video_types.py | 201 ++++++++++++++------
 comfy_extras/nodes_video.py                 |  51 +++++
 3 files changed, 207 insertions(+), 60 deletions(-)

diff --git a/comfy_api/latest/_input/video_types.py b/comfy_api/latest/_input/video_types.py
index e634a0311..451e9526e 100644
--- a/comfy_api/latest/_input/video_types.py
+++ b/comfy_api/latest/_input/video_types.py
@@ -34,6 +34,21 @@ class VideoInput(ABC):
         """
         pass
 
+    @abstractmethod
+    def as_trimmed(
+        self,
+        start_time: float | None = None,
+        duration: float | None = None,
+        strict_duration: bool = False,
+    ) -> VideoInput | None:
+        """
+        Create a new VideoInput which is trimmed to have the corresponding start_time and duration
+
+        Returns:
+            A new VideoInput, or None if the result would have negative duration
+        """
+        pass
+
     def get_stream_source(self) -> Union[str, io.BytesIO]:
         """
         Get a streamable source for the video. This allows processing without
diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py
index 1405d0b81..3463ed1c9 100644
--- a/comfy_api/latest/_input_impl/video_types.py
+++ b/comfy_api/latest/_input_impl/video_types.py
@@ -6,6 +6,7 @@ from typing import Optional
 from .._input import AudioInput, VideoInput
 import av
 import io
+import itertools
 import json
 import numpy as np
 import math
@@ -29,7 +30,6 @@ def container_to_output_format(container_format: str | None) -> str | None:
     formats = container_format.split(",")
     return formats[0]
 
-
 def get_open_write_kwargs(
     dest: str | io.BytesIO, container_format: str, to_format: str | None
 ) -> dict:
@@ -57,12 +57,14 @@ class VideoFromFile(VideoInput):
     Class representing video input from a file.
     """
 
-    def __init__(self, file: str | io.BytesIO):
+    def __init__(self, file: str | io.BytesIO, *, start_time: float=0, duration: float=0):
         """
         Initialize the VideoFromFile object based off of either a path on disk or a BytesIO object
         containing the file contents.
         """
         self.__file = file
+        self.__start_time = start_time
+        self.__duration = duration
 
     def get_stream_source(self) -> str | io.BytesIO:
         """
@@ -96,6 +98,16 @@ class VideoFromFile(VideoInput):
         Returns:
             Duration in seconds
         """
+        raw_duration = self._get_raw_duration()
+        if self.__start_time < 0:
+            duration_from_start = min(raw_duration, -self.__start_time)
+        else:
+            duration_from_start = raw_duration - self.__start_time
+        if self.__duration:
+            return min(self.__duration, duration_from_start)
+        return duration_from_start
+
+    def _get_raw_duration(self) -> float:
         if isinstance(self.__file, io.BytesIO):
             self.__file.seek(0)
         with av.open(self.__file, mode="r") as container:
@@ -113,9 +125,13 @@ class VideoFromFile(VideoInput):
             if video_stream and video_stream.average_rate:
                 frame_count = 0
                 container.seek(0)
-                for packet in container.demux(video_stream):
-                    for _ in packet.decode():
-                        frame_count += 1
+                frame_iterator = (
+                    container.decode(video_stream)
+                    if video_stream.codec.capabilities & 0x100
+                    else container.demux(video_stream)
+                )
+                for packet in frame_iterator:
+                    frame_count += 1
                 if frame_count > 0:
                     return float(frame_count / video_stream.average_rate)
 
@@ -131,36 +147,54 @@ class VideoFromFile(VideoInput):
 
         with av.open(self.__file, mode="r") as container:
             video_stream = self._get_first_video_stream(container)
-            # 1. Prefer the frames field if available
-            if video_stream.frames and video_stream.frames > 0:
+            # 1. Prefer the frames field if available and usable
+            if (
+                video_stream.frames
+                and video_stream.frames > 0
+                and not self.__start_time
+                and not self.__duration
+            ):
                 return int(video_stream.frames)
 
             # 2. Try to estimate from duration and average_rate using only metadata
-            if container.duration is not None and video_stream.average_rate:
-                duration_seconds = float(container.duration / av.time_base)
-                estimated_frames = int(round(duration_seconds * float(video_stream.average_rate)))
-                if estimated_frames > 0:
-                    return estimated_frames
-
             if (
                 getattr(video_stream, "duration", None) is not None
                 and getattr(video_stream, "time_base", None) is not None
                 and video_stream.average_rate
             ):
-                duration_seconds = float(video_stream.duration * video_stream.time_base)
+                raw_duration = float(video_stream.duration * video_stream.time_base)
+                if self.__start_time < 0:
+                    duration_from_start = min(raw_duration, -self.__start_time)
+                else:
+                    duration_from_start = raw_duration - self.__start_time
+                duration_seconds = min(self.__duration, duration_from_start)
                 estimated_frames = int(round(duration_seconds * float(video_stream.average_rate)))
                 if estimated_frames > 0:
                     return estimated_frames
 
             # 3. Last resort: decode frames and count them (streaming)
-            frame_count = 0
-            container.seek(0)
-            for packet in container.demux(video_stream):
-                for _ in packet.decode():
-                    frame_count += 1
-
-            if frame_count == 0:
-                raise ValueError(f"Could not determine frame count for file '{self.__file}'")
+            if self.__start_time < 0:
+                start_time = max(self._get_raw_duration() + self.__start_time, 0)
+            else:
+                start_time = self.__start_time
+            frame_count = 1
+            start_pts = int(start_time / video_stream.time_base)
+            end_pts = int((start_time + self.__duration) / video_stream.time_base)
+            container.seek(start_pts, stream=video_stream)
+            frame_iterator = (
+                container.decode(video_stream)
+                if video_stream.codec.capabilities & 0x100
+                else container.demux(video_stream)
+            )
+            for frame in frame_iterator:
+                if frame.pts >= start_pts:
+                    break
+            else:
+                raise ValueError(f"Could not determine frame count for file '{self.__file}'\nNo frames exist for start_time {self.__start_time}")
+            for frame in frame_iterator:
+                if frame.pts >= end_pts:
+                    break
+                frame_count += 1
             return frame_count
 
     def get_frame_rate(self) -> Fraction:
@@ -199,9 +233,21 @@ class VideoFromFile(VideoInput):
             return container.format.name
 
     def get_components_internal(self, container: InputContainer) -> VideoComponents:
+        video_stream = self._get_first_video_stream(container)
+        if self.__start_time < 0:
+            start_time = max(self._get_raw_duration() + self.__start_time, 0)
+        else:
+            start_time = self.__start_time
         # Get video frames
         frames = []
-        for frame in container.decode(video=0):
+        start_pts = int(start_time / video_stream.time_base)
+        end_pts = int((start_time + self.__duration) / video_stream.time_base)
+        container.seek(start_pts, stream=video_stream)
+        for frame in container.decode(video_stream):
+            if frame.pts < start_pts:
+                continue
+            if self.__duration and frame.pts >= end_pts:
+                break
             img = frame.to_ndarray(format='rgb24')  # shape: (H, W, 3)
             img = torch.from_numpy(img) / 255.0  # shape: (H, W, 3)
             frames.append(img)
@@ -209,31 +255,44 @@ class VideoFromFile(VideoInput):
         images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0)
 
         # Get frame rate
-        video_stream = next(s for s in container.streams if s.type == 'video')
-        frame_rate = Fraction(video_stream.average_rate) if video_stream and video_stream.average_rate else Fraction(1)
+        frame_rate = Fraction(video_stream.average_rate) if video_stream.average_rate else Fraction(1)
 
         # Get audio if available
         audio = None
-        try:
-            container.seek(0)  # Reset the container to the beginning
-            for stream in container.streams:
-                if stream.type != 'audio':
-                    continue
-                assert isinstance(stream, av.AudioStream)
-                audio_frames = []
-                for packet in container.demux(stream):
-                    for frame in packet.decode():
-                        assert isinstance(frame, av.AudioFrame)
-                        audio_frames.append(frame.to_ndarray())  # shape: (channels, samples)
-                if len(audio_frames) > 0:
-                    audio_data = np.concatenate(audio_frames, axis=1)  # shape: (channels, total_samples)
-                    audio_tensor = torch.from_numpy(audio_data).unsqueeze(0)  # shape: (1, channels, total_samples)
-                    audio = AudioInput({
-                        "waveform": audio_tensor,
-                        "sample_rate": int(stream.sample_rate) if stream.sample_rate else 1,
-                    })
-        except StopIteration:
-            pass  # No audio stream
+        container.seek(start_pts, stream=video_stream)
+        # Use last stream for consistency
+        if len(container.streams.audio):
+            audio_stream = container.streams.audio[-1]
+            audio_frames = []
+            resample = av.audio.resampler.AudioResampler(format='fltp').resample
+            frames = itertools.chain.from_iterable(
+                map(resample, container.decode(audio_stream))
+            )
+
+            has_first_frame = False
+            for frame in frames:
+                offset_seconds = start_time - frame.pts * audio_stream.time_base
+                to_skip = int(offset_seconds * audio_stream.sample_rate)
+                if to_skip < frame.samples:
+                    has_first_frame = True
+                    break
+            if has_first_frame:
+                audio_frames.append(frame.to_ndarray()[..., to_skip:])
+
+            for frame in frames:
+                if frame.time > start_time + self.__duration:
+                    break
+                audio_frames.append(frame.to_ndarray())  # shape: (channels, samples)
+            if len(audio_frames) > 0:
+                audio_data = np.concatenate(audio_frames, axis=1)  # shape: (channels, total_samples)
+                if self.__duration:
+                    audio_data = audio_data[..., :int(self.__duration * audio_stream.sample_rate)]
+
+                audio_tensor = torch.from_numpy(audio_data).unsqueeze(0)  # shape: (1, channels, total_samples)
+                audio = AudioInput({
+                    "waveform": audio_tensor,
+                    "sample_rate": int(audio_stream.sample_rate) if audio_stream.sample_rate else 1,
+                })
 
         metadata = container.metadata
         return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata)
@@ -250,7 +309,7 @@ class VideoFromFile(VideoInput):
         path: str | io.BytesIO,
         format: VideoContainer = VideoContainer.AUTO,
         codec: VideoCodec = VideoCodec.AUTO,
-        metadata: Optional[dict] = None
+        metadata: Optional[dict] = None,
     ):
         if isinstance(self.__file, io.BytesIO):
             self.__file.seek(0)  # Reset the BytesIO object to the beginning
@@ -262,15 +321,14 @@ class VideoFromFile(VideoInput):
                 reuse_streams = False
             if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None:
                 reuse_streams = False
+            if self.__start_time or self.__duration:
+                reuse_streams = False
 
             if not reuse_streams:
                 components = self.get_components_internal(container)
                 video = VideoFromComponents(components)
                 return video.save_to(
-                    path,
-                    format=format,
-                    codec=codec,
-                    metadata=metadata
+                    path, format=format, codec=codec, metadata=metadata
                 )
 
             streams = container.streams
@@ -304,10 +362,21 @@ class VideoFromFile(VideoInput):
                         output_container.mux(packet)
 
     def _get_first_video_stream(self, container: InputContainer):
-        video_stream = next((s for s in container.streams if s.type == "video"), None)
-        if video_stream is None:
-            raise ValueError(f"No video stream found in file '{self.__file}'")
-        return video_stream
+        if len(container.streams.video):
+            return container.streams.video[0]
+        raise ValueError(f"No video stream found in file '{self.__file}'")
+
+    def as_trimmed(
+        self, start_time: float = 0, duration: float = 0, strict_duration: bool = True
+    ) -> VideoInput | None:
+        trimmed = VideoFromFile(
+            self.get_stream_source(),
+            start_time=start_time + self.__start_time,
+            duration=duration,
+        )
+        if trimmed.get_duration() < duration and strict_duration:
+            return None
+        return trimmed
 
 
 class VideoFromComponents(VideoInput):
@@ -322,7 +391,7 @@ class VideoFromComponents(VideoInput):
         return VideoComponents(
             images=self.__components.images,
             audio=self.__components.audio,
-            frame_rate=self.__components.frame_rate
+            frame_rate=self.__components.frame_rate,
         )
 
     def save_to(
@@ -330,7 +399,7 @@ class VideoFromComponents(VideoInput):
         path: str,
         format: VideoContainer = VideoContainer.AUTO,
         codec: VideoCodec = VideoCodec.AUTO,
-        metadata: Optional[dict] = None
+        metadata: Optional[dict] = None,
     ):
         if format != VideoContainer.AUTO and format != VideoContainer.MP4:
             raise ValueError("Only MP4 format is supported for now")
@@ -357,7 +426,10 @@ class VideoFromComponents(VideoInput):
             audio_stream: Optional[av.AudioStream] = None
             if self.__components.audio:
                 audio_sample_rate = int(self.__components.audio['sample_rate'])
-                audio_stream = output.add_stream('aac', rate=audio_sample_rate)
+                waveform = self.__components.audio['waveform']
+                waveform = waveform[0, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])]
+                layout = {1: 'mono', 2: 'stereo', 6: '5.1'}.get(waveform.shape[0], 'stereo')
+                audio_stream = output.add_stream('aac', rate=audio_sample_rate, layout=layout)
 
             # Encode video
             for i, frame in enumerate(self.__components.images):
@@ -372,12 +444,21 @@ class VideoFromComponents(VideoInput):
             output.mux(packet)
 
             if audio_stream and self.__components.audio:
-                waveform = self.__components.audio['waveform']
-                waveform = waveform[:, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])]
-                frame = av.AudioFrame.from_ndarray(waveform.movedim(2, 1).reshape(1, -1).float().cpu().numpy(), format='flt', layout='mono' if waveform.shape[1] == 1 else 'stereo')
+                frame = av.AudioFrame.from_ndarray(waveform.float().cpu().numpy(), format='fltp', layout=layout)
                 frame.sample_rate = audio_sample_rate
                 frame.pts = 0
                 output.mux(audio_stream.encode(frame))
 
                 # Flush encoder
                 output.mux(audio_stream.encode(None))
+
+    def as_trimmed(
+        self,
+        start_time: float | None = None,
+        duration: float | None = None,
+        strict_duration: bool = True,
+    ) -> VideoInput | None:
+        if self.get_duration() < start_time + duration:
+            return None
+        #TODO Consider tracking duration and trimming at time of save?
+        return VideoFromFile(self.get_stream_source(), start_time=start_time, duration=duration)
diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py
index ccf7b63d3..cd765a7c1 100644
--- a/comfy_extras/nodes_video.py
+++ b/comfy_extras/nodes_video.py
@@ -202,6 +202,56 @@ class LoadVideo(io.ComfyNode):
 
         return True
 
+class VideoSlice(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="Video Slice",
+            display_name="Video Slice",
+            search_aliases=[
+                "trim video duration",
+                "skip first frames",
+                "frame load cap",
+                "start time",
+            ],
+            category="image/video",
+            inputs=[
+                io.Video.Input("video"),
+                io.Float.Input(
+                    "start_time",
+                    default=0.0,
+                    max=1e5,
+                    min=-1e5,
+                    step=0.001,
+                    tooltip="Start time in seconds",
+                ),
+                io.Float.Input(
+                    "duration",
+                    default=0.0,
+                    min=0.0,
+                    step=0.001,
+                    tooltip="Duration in seconds, or 0 for unlimited duration",
+                ),
+                io.Boolean.Input(
+                    "strict_duration",
+                    default=False,
+                    tooltip="If True, when the specified duration is not possible, an error will be raised.",
+                ),
+            ],
+            outputs=[
+                io.Video.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, video: io.Video.Type, start_time: float, duration: float, strict_duration: bool) -> io.NodeOutput:
+        trimmed = video.as_trimmed(start_time, duration, strict_duration=strict_duration)
+        if trimmed is not None:
+            return io.NodeOutput(trimmed)
+        raise ValueError(
+            f"Failed to slice video:\nSource duration: {video.get_duration()}\nStart time: {start_time}\nTarget duration: {duration}"
+        )
+
 
 class VideoExtension(ComfyExtension):
     @override
@@ -212,6 +262,7 @@ class VideoExtension(ComfyExtension):
             CreateVideo,
             GetVideoComponents,
             LoadVideo,
+            VideoSlice,
         ]
 
 async def comfy_entrypoint() -> VideoExtension:

From cdcf4119b3e826bd69fa986772485fb5b44a54cd Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Wed, 11 Feb 2026 10:45:19 +0800
Subject: [PATCH 06/25] [Trainer] training with proper offloading (#12189)

* Fix bypass dtype/device moving

* Force offloading mode for training

* training context var

* offloading implementation in training node

* fix wrong input type

* Support bypass load lora model, correct adapter/offloading handling
---
 comfy/ldm/flux/math.py         |  39 +++++---
 comfy/model_management.py      |   5 +
 comfy/sampler_helpers.py       |  16 +++-
 comfy/weight_adapter/bypass.py |  20 ++--
 comfy_extras/nodes_train.py    | 162 ++++++++++++++++++++++++++++-----
 5 files changed, 196 insertions(+), 46 deletions(-)

diff --git a/comfy/ldm/flux/math.py b/comfy/ldm/flux/math.py
index f9597de5b..5e764bb46 100644
--- a/comfy/ldm/flux/math.py
+++ b/comfy/ldm/flux/math.py
@@ -29,19 +29,34 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
     return out.to(dtype=torch.float32, device=pos.device)
 
 
+def _apply_rope1(x: Tensor, freqs_cis: Tensor):
+    x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2)
+
+    x_out = freqs_cis[..., 0] * x_[..., 0]
+    x_out.addcmul_(freqs_cis[..., 1], x_[..., 1])
+
+    return x_out.reshape(*x.shape).type_as(x)
+
+
+def _apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
+    return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)
+
+
 try:
     import comfy.quant_ops
-    apply_rope = comfy.quant_ops.ck.apply_rope
-    apply_rope1 = comfy.quant_ops.ck.apply_rope1
+    q_apply_rope = comfy.quant_ops.ck.apply_rope
+    q_apply_rope1 = comfy.quant_ops.ck.apply_rope1
+    def apply_rope(xq, xk, freqs_cis):
+        if comfy.model_management.in_training:
+            return _apply_rope(xq, xk, freqs_cis)
+        else:
+            return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)
+    def apply_rope1(x, freqs_cis):
+        if comfy.model_management.in_training:
+            return _apply_rope1(x, freqs_cis)
+        else:
+            return q_apply_rope1(x, freqs_cis)
 except:
     logging.warning("No comfy kitchen, using old apply_rope functions.")
-    def apply_rope1(x: Tensor, freqs_cis: Tensor):
-        x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2)
-
-        x_out = freqs_cis[..., 0] * x_[..., 0]
-        x_out.addcmul_(freqs_cis[..., 1], x_[..., 1])
-
-        return x_out.reshape(*x.shape).type_as(x)
-
-    def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
-        return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)
+    apply_rope = _apply_rope
+    apply_rope1 = _apply_rope1
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 6018c1ab6..304931eb0 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -55,6 +55,11 @@ cpu_state = CPUState.GPU
 
 total_vram = 0
 
+
+# Training Related State
+in_training = False
+
+
 def get_supported_float8_types():
     float8_types = []
     try:
diff --git a/comfy/sampler_helpers.py b/comfy/sampler_helpers.py
index 9134e6d71..1f75f2ba7 100644
--- a/comfy/sampler_helpers.py
+++ b/comfy/sampler_helpers.py
@@ -122,20 +122,26 @@ def estimate_memory(model, noise_shape, conds):
     minimum_memory_required = model.model.memory_required([noise_shape[0]] + list(noise_shape[1:]), cond_shapes=cond_shapes_min)
     return memory_required, minimum_memory_required
 
-def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False):
+def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False, force_offload=False):
     executor = comfy.patcher_extension.WrapperExecutor.new_executor(
         _prepare_sampling,
         comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.PREPARE_SAMPLING, model_options, is_model_options=True)
     )
-    return executor.execute(model, noise_shape, conds, model_options=model_options, force_full_load=force_full_load)
+    return executor.execute(model, noise_shape, conds, model_options=model_options, force_full_load=force_full_load, force_offload=force_offload)
 
-def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False):
+def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False, force_offload=False):
     real_model: BaseModel = None
     models, inference_memory = get_additional_models(conds, model.model_dtype())
     models += get_additional_models_from_model_options(model_options)
     models += model.get_nested_additional_models()  # TODO: does this require inference_memory update?
-    memory_required, minimum_memory_required = estimate_memory(model, noise_shape, conds)
-    comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required + inference_memory, minimum_memory_required=minimum_memory_required + inference_memory, force_full_load=force_full_load)
+    if force_offload: # In training + offload enabled, we want to force prepare sampling to trigger partial load
+        memory_required = 1e20
+        minimum_memory_required = None
+    else:
+        memory_required, minimum_memory_required = estimate_memory(model, noise_shape, conds)
+        memory_required += inference_memory
+        minimum_memory_required += inference_memory
+    comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required, minimum_memory_required=minimum_memory_required, force_full_load=force_full_load)
     real_model = model.model
 
     return real_model, conds, models
diff --git a/comfy/weight_adapter/bypass.py b/comfy/weight_adapter/bypass.py
index d4aaf98ca..b9d5ec7d9 100644
--- a/comfy/weight_adapter/bypass.py
+++ b/comfy/weight_adapter/bypass.py
@@ -21,6 +21,7 @@ from typing import Optional, Union
 import torch
 import torch.nn as nn
 
+import comfy.model_management
 from .base import WeightAdapterBase, WeightAdapterTrainBase
 from comfy.patcher_extension import PatcherInjection
 
@@ -181,18 +182,21 @@ class BypassForwardHook:
             )
             return  # Already injected
 
-        # Move adapter weights to module's device to avoid CPU-GPU transfer on every forward
-        device = None
+        # Move adapter weights to compute device (GPU)
+        # Use get_torch_device() instead of module.weight.device because
+        # with offloading, module weights may be on CPU while compute happens on GPU
+        device = comfy.model_management.get_torch_device()
+
+        # Get dtype from module weight if available
         dtype = None
         if hasattr(self.module, "weight") and self.module.weight is not None:
-            device = self.module.weight.device
             dtype = self.module.weight.dtype
-        elif hasattr(self.module, "W_q"):  # Quantized layers might use different attr
-            device = self.module.W_q.device
-            dtype = self.module.W_q.dtype
 
-        if device is not None:
-            self._move_adapter_weights_to_device(device, dtype)
+        # Only use dtype if it's a standard float type, not quantized
+        if dtype is not None and dtype not in (torch.float32, torch.float16, torch.bfloat16):
+            dtype = None
+
+        self._move_adapter_weights_to_device(device, dtype)
 
         self.original_forward = self.module.forward
         self.module.forward = self._bypass_forward
diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py
index 024a89391..630eedc9f 100644
--- a/comfy_extras/nodes_train.py
+++ b/comfy_extras/nodes_train.py
@@ -4,6 +4,7 @@ import os
 import numpy as np
 import safetensors
 import torch
+import torch.nn as nn
 import torch.utils.checkpoint
 from tqdm.auto import trange
 from PIL import Image, ImageDraw, ImageFont
@@ -27,6 +28,11 @@ class TrainGuider(comfy_extras.nodes_custom_sampler.Guider_Basic):
     """
     CFGGuider with modifications for training specific logic
     """
+
+    def __init__(self, *args, offloading=False, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.offloading = offloading
+
     def outer_sample(
         self,
         noise,
@@ -45,9 +51,11 @@ class TrainGuider(comfy_extras.nodes_custom_sampler.Guider_Basic):
                 noise.shape,
                 self.conds,
                 self.model_options,
-                force_full_load=True,  # mirror behavior in TrainLoraNode.execute() to keep model loaded
+                force_full_load=not self.offloading,
+                force_offload=self.offloading,
             )
         )
+        torch.cuda.empty_cache()
         device = self.model_patcher.load_device
 
         if denoise_mask is not None:
@@ -404,16 +412,97 @@ def find_all_highest_child_module_with_forward(
     return result
 
 
-def patch(m):
+def find_modules_at_depth(
+    model: nn.Module, depth: int = 1, result=None, current_depth=0, name=None
+) -> list[nn.Module]:
+    """
+    Find modules at a specific depth level for gradient checkpointing.
+
+    Args:
+        model: The model to search
+        depth: Target depth level (1 = top-level blocks, 2 = their children, etc.)
+        result: Accumulator for results
+        current_depth: Current recursion depth
+        name: Current module name for logging
+
+    Returns:
+        List of modules at the target depth
+    """
+    if result is None:
+        result = []
+    name = name or "root"
+
+    # Skip container modules (they don't have meaningful forward)
+    is_container = isinstance(model, (nn.ModuleList, nn.Sequential, nn.ModuleDict))
+    has_forward = hasattr(model, "forward") and not is_container
+
+    if has_forward:
+        current_depth += 1
+        if current_depth == depth:
+            result.append(model)
+            logging.debug(f"Found module at depth {depth}: {name} ({model.__class__.__name__})")
+            return result
+
+    # Recurse into children
+    for next_name, child in model.named_children():
+        find_modules_at_depth(child, depth, result, current_depth, f"{name}.{next_name}")
+
+    return result
+
+
+class OffloadCheckpointFunction(torch.autograd.Function):
+    """
+    Gradient checkpointing that works with weight offloading.
+
+    Forward: no_grad -> compute -> weights can be freed
+    Backward: enable_grad -> recompute -> backward -> weights can be freed
+
+    For single input, single output modules (Linear, Conv*).
+    """
+
+    @staticmethod
+    def forward(ctx, x: torch.Tensor, forward_fn):
+        ctx.save_for_backward(x)
+        ctx.forward_fn = forward_fn
+        with torch.no_grad():
+            return forward_fn(x)
+
+    @staticmethod
+    def backward(ctx, grad_out: torch.Tensor):
+        x, = ctx.saved_tensors
+        forward_fn = ctx.forward_fn
+
+        # Clear context early
+        ctx.forward_fn = None
+
+        with torch.enable_grad():
+            x_detached = x.detach().requires_grad_(True)
+            y = forward_fn(x_detached)
+            y.backward(grad_out)
+            grad_x = x_detached.grad
+
+        # Explicit cleanup
+        del y, x_detached, forward_fn
+
+        return grad_x, None
+
+
+def patch(m, offloading=False):
     if not hasattr(m, "forward"):
         return
     org_forward = m.forward
 
-    def fwd(args, kwargs):
-        return org_forward(*args, **kwargs)
+    # Branch 1: Linear/Conv* -> offload-compatible checkpoint (single input/output)
+    if offloading and isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)):
+        def checkpointing_fwd(x):
+            return OffloadCheckpointFunction.apply(x, org_forward)
+    # Branch 2: Others -> standard checkpoint
+    else:
+        def fwd(args, kwargs):
+            return org_forward(*args, **kwargs)
 
-    def checkpointing_fwd(*args, **kwargs):
-        return torch.utils.checkpoint.checkpoint(fwd, args, kwargs, use_reentrant=False)
+        def checkpointing_fwd(*args, **kwargs):
+            return torch.utils.checkpoint.checkpoint(fwd, args, kwargs, use_reentrant=False)
 
     m.org_forward = org_forward
     m.forward = checkpointing_fwd
@@ -936,6 +1025,18 @@ class TrainLoraNode(io.ComfyNode):
                     default=True,
                     tooltip="Use gradient checkpointing for training.",
                 ),
+                io.Int.Input(
+                    "checkpoint_depth",
+                    default=1,
+                    min=1,
+                    max=5,
+                    tooltip="Depth level for gradient checkpointing.",
+                ),
+                io.Boolean.Input(
+                    "offloading",
+                    default=False,
+                    tooltip="Depth level for gradient checkpointing.",
+                ),
                 io.Combo.Input(
                     "existing_lora",
                     options=folder_paths.get_filename_list("loras") + ["[None]"],
@@ -982,6 +1083,8 @@ class TrainLoraNode(io.ComfyNode):
         lora_dtype,
         algorithm,
         gradient_checkpointing,
+        checkpoint_depth,
+        offloading,
         existing_lora,
         bucket_mode,
         bypass_mode,
@@ -1000,6 +1103,8 @@ class TrainLoraNode(io.ComfyNode):
         lora_dtype = lora_dtype[0]
         algorithm = algorithm[0]
         gradient_checkpointing = gradient_checkpointing[0]
+        offloading = offloading[0]
+        checkpoint_depth = checkpoint_depth[0]
         existing_lora = existing_lora[0]
         bucket_mode = bucket_mode[0]
         bypass_mode = bypass_mode[0]
@@ -1054,16 +1159,18 @@ class TrainLoraNode(io.ComfyNode):
 
             # Setup gradient checkpointing
             if gradient_checkpointing:
-                for m in find_all_highest_child_module_with_forward(
-                    mp.model.diffusion_model
-                ):
-                    patch(m)
+                modules_to_patch = find_modules_at_depth(
+                    mp.model.diffusion_model, depth=checkpoint_depth
+                )
+                logging.info(f"Gradient checkpointing: patching {len(modules_to_patch)} modules at depth {checkpoint_depth}")
+                for m in modules_to_patch:
+                    patch(m, offloading=offloading)
 
             torch.cuda.empty_cache()
             # With force_full_load=False we should be able to have offloading
             # But for offloading in training we need custom AutoGrad hooks for fwd/bwd
             comfy.model_management.load_models_gpu(
-                [mp], memory_required=1e20, force_full_load=True
+                [mp], memory_required=1e20, force_full_load=not offloading
             )
             torch.cuda.empty_cache()
 
@@ -1100,7 +1207,7 @@ class TrainLoraNode(io.ComfyNode):
                 )
 
             # Setup guider
-            guider = TrainGuider(mp)
+            guider = TrainGuider(mp, offloading=offloading)
             guider.set_conds(positive)
 
             # Inject bypass hooks if bypass mode is enabled
@@ -1113,6 +1220,7 @@ class TrainLoraNode(io.ComfyNode):
 
             # Run training loop
             try:
+                comfy.model_management.in_training = True
                 _run_training_loop(
                     guider,
                     train_sampler,
@@ -1123,6 +1231,7 @@ class TrainLoraNode(io.ComfyNode):
                     multi_res,
                 )
             finally:
+                comfy.model_management.in_training = False
                 # Eject bypass hooks if they were injected
                 if bypass_injections is not None:
                     for injection in bypass_injections:
@@ -1132,19 +1241,20 @@ class TrainLoraNode(io.ComfyNode):
                     unpatch(m)
             del train_sampler, optimizer
 
-            # Finalize adapters
+            for param in lora_sd:
+                lora_sd[param] = lora_sd[param].to(lora_dtype).detach()
+
             for adapter in all_weight_adapters:
                 adapter.requires_grad_(False)
-
-            for param in lora_sd:
-                lora_sd[param] = lora_sd[param].to(lora_dtype)
+                del adapter
+            del all_weight_adapters
 
             # mp in train node is highly specialized for training
             # use it in inference will result in bad behavior so we don't return it
             return io.NodeOutput(lora_sd, loss_map, steps + existing_steps)
 
 
-class LoraModelLoader(io.ComfyNode):#
+class LoraModelLoader(io.ComfyNode):
     @classmethod
     def define_schema(cls):
         return io.Schema(
@@ -1166,6 +1276,11 @@ class LoraModelLoader(io.ComfyNode):#
                     max=100.0,
                     tooltip="How strongly to modify the diffusion model. This value can be negative.",
                 ),
+                io.Boolean.Input(
+                    "bypass",
+                    default=False,
+                    tooltip="When enabled, applies LoRA in bypass mode without modifying base model weights. Useful for training and when model weights are offloaded.",
+                ),
             ],
             outputs=[
                 io.Model.Output(
@@ -1175,13 +1290,18 @@ class LoraModelLoader(io.ComfyNode):#
         )
 
     @classmethod
-    def execute(cls, model, lora, strength_model):
+    def execute(cls, model, lora, strength_model, bypass=False):
         if strength_model == 0:
             return io.NodeOutput(model)
 
-        model_lora, _ = comfy.sd.load_lora_for_models(
-            model, None, lora, strength_model, 0
-        )
+        if bypass:
+            model_lora, _ = comfy.sd.load_bypass_lora_for_models(
+                model, None, lora, strength_model, 0
+            )
+        else:
+            model_lora, _ = comfy.sd.load_lora_for_models(
+                model, None, lora, strength_model, 0
+            )
         return io.NodeOutput(model_lora)
 
 

From 76a7fa96dbdc2eda89218601fe3aed5997df055f Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Tue, 10 Feb 2026 19:04:32 -0800
Subject: [PATCH 07/25] Make built in lora training work on anima. (#12402)

---
 comfy/ldm/anima/model.py | 16 ++++++++++++++--
 comfy/model_base.py      | 12 ++++++++----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/comfy/ldm/anima/model.py b/comfy/ldm/anima/model.py
index 2e6ed58fa..6fb51c4a4 100644
--- a/comfy/ldm/anima/model.py
+++ b/comfy/ldm/anima/model.py
@@ -195,8 +195,20 @@ class Anima(MiniTrainDIT):
         super().__init__(*args, **kwargs)
         self.llm_adapter = LLMAdapter(device=kwargs.get("device"), dtype=kwargs.get("dtype"), operations=kwargs.get("operations"))
 
-    def preprocess_text_embeds(self, text_embeds, text_ids):
+    def preprocess_text_embeds(self, text_embeds, text_ids, t5xxl_weights=None):
         if text_ids is not None:
-            return self.llm_adapter(text_embeds, text_ids)
+            out = self.llm_adapter(text_embeds, text_ids)
+            if t5xxl_weights is not None:
+                out = out * t5xxl_weights
+
+            if out.shape[1] < 512:
+                out = torch.nn.functional.pad(out, (0, 0, 0, 512 - out.shape[1]))
+            return out
         else:
             return text_embeds
+
+    def forward(self, x, timesteps, context, **kwargs):
+        t5xxl_ids = kwargs.pop("t5xxl_ids", None)
+        if t5xxl_ids is not None:
+            context = self.preprocess_text_embeds(context, t5xxl_ids, t5xxl_weights=kwargs.pop("t5xxl_weights", None))
+        return super().forward(x, timesteps, context, **kwargs)
diff --git a/comfy/model_base.py b/comfy/model_base.py
index 858789b30..4a74cb1ce 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1160,12 +1160,16 @@ class Anima(BaseModel):
         device = kwargs["device"]
         if cross_attn is not None:
             if t5xxl_ids is not None:
-                cross_attn = self.diffusion_model.preprocess_text_embeds(cross_attn.to(device=device, dtype=self.get_dtype()), t5xxl_ids.unsqueeze(0).to(device=device))
                 if t5xxl_weights is not None:
-                    cross_attn *= t5xxl_weights.unsqueeze(0).unsqueeze(-1).to(cross_attn)
+                    t5xxl_weights = t5xxl_weights.unsqueeze(0).unsqueeze(-1).to(cross_attn)
+                t5xxl_ids = t5xxl_ids.unsqueeze(0)
+
+                if torch.is_inference_mode_enabled():  # if not we are training
+                    cross_attn = self.diffusion_model.preprocess_text_embeds(cross_attn.to(device=device, dtype=self.get_dtype()), t5xxl_ids.to(device=device), t5xxl_weights=t5xxl_weights.to(device=device, dtype=self.get_dtype()))
+                else:
+                    out['t5xxl_ids'] = comfy.conds.CONDRegular(t5xxl_ids)
+                    out['t5xxl_weights'] = comfy.conds.CONDRegular(t5xxl_weights)
 
-                if cross_attn.shape[1] < 512:
-                    cross_attn = torch.nn.functional.pad(cross_attn, (0, 0, 0, 512 - cross_attn.shape[1]))
             out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
         return out
 

From 2c7cef4a23c08e3f02a33c693d927158a15a11f6 Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Wed, 11 Feb 2026 20:51:49 +0200
Subject: [PATCH 08/25] fix(api-nodes): retry on connection errors during
 polling instead of aborting (#12393)

---
 comfy_api_nodes/util/client.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py
index 8a1259506..391748e7a 100644
--- a/comfy_api_nodes/util/client.py
+++ b/comfy_api_nodes/util/client.py
@@ -143,9 +143,9 @@ async def poll_op(
     poll_interval: float = 5.0,
     max_poll_attempts: int = 160,
     timeout_per_poll: float = 120.0,
-    max_retries_per_poll: int = 3,
+    max_retries_per_poll: int = 10,
     retry_delay_per_poll: float = 1.0,
-    retry_backoff_per_poll: float = 2.0,
+    retry_backoff_per_poll: float = 1.4,
     estimated_duration: int | None = None,
     cancel_endpoint: ApiEndpoint | None = None,
     cancel_timeout: float = 10.0,
@@ -240,9 +240,9 @@ async def poll_op_raw(
     poll_interval: float = 5.0,
     max_poll_attempts: int = 160,
     timeout_per_poll: float = 120.0,
-    max_retries_per_poll: int = 3,
+    max_retries_per_poll: int = 10,
     retry_delay_per_poll: float = 1.0,
-    retry_backoff_per_poll: float = 2.0,
+    retry_backoff_per_poll: float = 1.4,
     estimated_duration: int | None = None,
     cancel_endpoint: ApiEndpoint | None = None,
     cancel_timeout: float = 10.0,

From 4993411fd9a43d642971925272c3748d9e058131 Mon Sep 17 00:00:00 2001
From: Benjamin Lu <benjaminlu1107@gmail.com>
Date: Wed, 11 Feb 2026 11:15:13 -0800
Subject: [PATCH 09/25] Dispatch desktop auto-bump when a ComfyUI release is
 published (#12398)

* Dispatch desktop auto-bump on ComfyUI release publish

* Fix release webhook secret checks in step conditions

* Require desktop dispatch token in release webhook

* Apply suggestion from @Copilot

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Luke Mino-Altherr <lminoaltherr@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
---
 .github/workflows/release-webhook.yml | 36 +++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/.github/workflows/release-webhook.yml b/.github/workflows/release-webhook.yml
index 6fceb7560..737e4c488 100644
--- a/.github/workflows/release-webhook.yml
+++ b/.github/workflows/release-webhook.yml
@@ -7,6 +7,8 @@ on:
 jobs:
   send-webhook:
     runs-on: ubuntu-latest
+    env:
+      DESKTOP_REPO_DISPATCH_TOKEN: ${{ secrets.DESKTOP_REPO_DISPATCH_TOKEN }}
     steps:
       - name: Send release webhook
         env:
@@ -106,3 +108,37 @@ jobs:
             --fail --silent --show-error
           
           echo "✅ Release webhook sent successfully"
+
+      - name: Send repository dispatch to desktop
+        env:
+          DISPATCH_TOKEN: ${{ env.DESKTOP_REPO_DISPATCH_TOKEN }}
+          RELEASE_TAG: ${{ github.event.release.tag_name }}
+          RELEASE_URL: ${{ github.event.release.html_url }}
+        run: |
+          set -euo pipefail
+
+          if [ -z "${DISPATCH_TOKEN:-}" ]; then
+            echo "::error::DESKTOP_REPO_DISPATCH_TOKEN is required but not set."
+            exit 1
+          fi
+
+          PAYLOAD="$(jq -n \
+            --arg release_tag "$RELEASE_TAG" \
+            --arg release_url "$RELEASE_URL" \
+            '{
+              event_type: "comfyui_release_published",
+              client_payload: {
+                release_tag: $release_tag,
+                release_url: $release_url
+              }
+            }')"
+
+          curl -fsSL \
+            -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Content-Type: application/json" \
+            -H "Authorization: Bearer ${DISPATCH_TOKEN}" \
+            https://api.github.com/repos/Comfy-Org/desktop/dispatches \
+            -d "$PAYLOAD"
+
+          echo "✅ Dispatched ComfyUI release ${RELEASE_TAG} to Comfy-Org/desktop"

From 2b7cc7e3b69127a81b9232d4e8305eb678fa3d0c Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Wed, 11 Feb 2026 21:30:19 +0200
Subject: [PATCH 10/25] [API Nodes] enable Magnific Upscalers (#12179)

* feat(api-nodes): enable Magnific Upscalers

* update price badges

---------

Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
---
 comfy_api_nodes/nodes_magnific.py | 62 +++++++++++++++++++++++++++----
 1 file changed, 55 insertions(+), 7 deletions(-)

diff --git a/comfy_api_nodes/nodes_magnific.py b/comfy_api_nodes/nodes_magnific.py
index 013e71cc8..83a581c5d 100644
--- a/comfy_api_nodes/nodes_magnific.py
+++ b/comfy_api_nodes/nodes_magnific.py
@@ -30,6 +30,30 @@ from comfy_api_nodes.util import (
     validate_image_dimensions,
 )
 
+_EUR_TO_USD = 1.19
+
+
+def _tier_price_eur(megapixels: float) -> float:
+    """Price in EUR for a single Magnific upscaling step based on input megapixels."""
+    if megapixels <= 1.3:
+        return 0.143
+    if megapixels <= 3.0:
+        return 0.286
+    if megapixels <= 6.4:
+        return 0.429
+    return 1.716
+
+
+def _calculate_magnific_upscale_price_usd(width: int, height: int, scale: int) -> float:
+    """Calculate total Magnific upscale price in USD for given input dimensions and scale factor."""
+    num_steps = int(math.log2(scale))
+    total_eur = 0.0
+    pixels = width * height
+    for _ in range(num_steps):
+        total_eur += _tier_price_eur(pixels / 1_000_000)
+        pixels *= 4
+    return round(total_eur * _EUR_TO_USD, 2)
+
 
 class MagnificImageUpscalerCreativeNode(IO.ComfyNode):
     @classmethod
@@ -103,11 +127,20 @@ class MagnificImageUpscalerCreativeNode(IO.ComfyNode):
             ],
             is_api_node=True,
             price_badge=IO.PriceBadge(
-                depends_on=IO.PriceBadgeDepends(widgets=["scale_factor"]),
+                depends_on=IO.PriceBadgeDepends(widgets=["scale_factor", "auto_downscale"]),
                 expr="""
                 (
-                  $max := widgets.scale_factor = "2x" ? 1.326 : 1.657;
-                  {"type": "range_usd", "min_usd": 0.11, "max_usd": $max}
+                  $ad := widgets.auto_downscale;
+                  $mins := $ad
+                    ? {"2x": 0.172, "4x": 0.343, "8x": 0.515, "16x": 0.515}
+                    : {"2x": 0.172, "4x": 0.343, "8x": 0.515, "16x": 0.844};
+                  $maxs := {"2x": 0.515, "4x": 0.844, "8x": 1.015, "16x": 1.187};
+                  {
+                    "type": "range_usd",
+                    "min_usd": $lookup($mins, widgets.scale_factor),
+                    "max_usd": $lookup($maxs, widgets.scale_factor),
+                    "format": { "approximate": true }
+                  }
                 )
                 """,
             ),
@@ -168,6 +201,10 @@ class MagnificImageUpscalerCreativeNode(IO.ComfyNode):
                     f"Use a smaller input image or lower scale factor."
                 )
 
+        final_height, final_width = get_image_dimensions(image)
+        actual_scale = int(scale_factor.rstrip("x"))
+        price_usd = _calculate_magnific_upscale_price_usd(final_width, final_height, actual_scale)
+
         initial_res = await sync_op(
             cls,
             ApiEndpoint(path="/proxy/freepik/v1/ai/image-upscaler", method="POST"),
@@ -189,6 +226,7 @@ class MagnificImageUpscalerCreativeNode(IO.ComfyNode):
             ApiEndpoint(path=f"/proxy/freepik/v1/ai/image-upscaler/{initial_res.task_id}"),
             response_model=TaskResponse,
             status_extractor=lambda x: x.status,
+            price_extractor=lambda _: price_usd,
             poll_interval=10.0,
             max_poll_attempts=480,
         )
@@ -257,8 +295,14 @@ class MagnificImageUpscalerPreciseV2Node(IO.ComfyNode):
                 depends_on=IO.PriceBadgeDepends(widgets=["scale_factor"]),
                 expr="""
                 (
-                  $max := widgets.scale_factor = "2x" ? 1.326 : 1.657;
-                  {"type": "range_usd", "min_usd": 0.11, "max_usd": $max}
+                  $mins := {"2x": 0.172, "4x": 0.343, "8x": 0.515, "16x": 0.844};
+                  $maxs := {"2x": 2.045, "4x": 2.545, "8x": 2.889, "16x": 3.06};
+                  {
+                    "type": "range_usd",
+                    "min_usd": $lookup($mins, widgets.scale_factor),
+                    "max_usd": $lookup($maxs, widgets.scale_factor),
+                    "format": { "approximate": true }
+                  }
                 )
                 """,
             ),
@@ -321,6 +365,9 @@ class MagnificImageUpscalerPreciseV2Node(IO.ComfyNode):
                     f"Use a smaller input image or lower scale factor."
                 )
 
+        final_height, final_width = get_image_dimensions(image)
+        price_usd = _calculate_magnific_upscale_price_usd(final_width, final_height, requested_scale)
+
         initial_res = await sync_op(
             cls,
             ApiEndpoint(path="/proxy/freepik/v1/ai/image-upscaler-precision-v2", method="POST"),
@@ -339,6 +386,7 @@ class MagnificImageUpscalerPreciseV2Node(IO.ComfyNode):
             ApiEndpoint(path=f"/proxy/freepik/v1/ai/image-upscaler-precision-v2/{initial_res.task_id}"),
             response_model=TaskResponse,
             status_extractor=lambda x: x.status,
+            price_extractor=lambda _: price_usd,
             poll_interval=10.0,
             max_poll_attempts=480,
         )
@@ -877,8 +925,8 @@ class MagnificExtension(ComfyExtension):
     @override
     async def get_node_list(self) -> list[type[IO.ComfyNode]]:
         return [
-            # MagnificImageUpscalerCreativeNode,
-            # MagnificImageUpscalerPreciseV2Node,
+            MagnificImageUpscalerCreativeNode,
+            MagnificImageUpscalerPreciseV2Node,
             MagnificImageStyleTransferNode,
             MagnificImageRelightNode,
             MagnificImageSkinEnhancerNode,

From d297a749a2fa3a34ebff898797feef161bcd64c6 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Wed, 11 Feb 2026 11:50:16 -0800
Subject: [PATCH 11/25] dynamic_vram: Fix windows Aimdo crash + Fix LLM
 performance (#12408)

* model_management: lazy-cache aimdo_tensor

These tensors cosntructed from aimdo-allocations are CPU expensive to
make on the pytorch side. Add a cache version that will be valid with
signature match to fast path past whatever torch is doing.

* dynamic_vram: Minimize fast path CPU work

Move as much as possible inside the not resident if block and cache
the formed weight and bias rather than the flat intermediates. In
extreme layer weight rates this adds up.
---
 comfy/model_management.py |  8 ++++++--
 comfy/model_patcher.py    |  2 --
 comfy/ops.py              | 21 ++++++++++++++-------
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 304931eb0..38c3e482b 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -1213,8 +1213,12 @@ def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, str
 
         signature = comfy_aimdo.model_vbar.vbar_fault(weight._v)
         if signature is not None:
-            v_tensor = comfy.memory_management.interpret_gathered_like(cast_geometry, weight._v_tensor)[0]
-            if not comfy_aimdo.model_vbar.vbar_signature_compare(signature, weight._v_signature):
+            if comfy_aimdo.model_vbar.vbar_signature_compare(signature, weight._v_signature):
+                v_tensor = weight._v_tensor
+            else:
+                raw_tensor = comfy_aimdo.torch.aimdo_to_tensor(weight._v, device)
+                v_tensor = comfy.memory_management.interpret_gathered_like(cast_geometry, raw_tensor)[0]
+                weight._v_tensor = v_tensor
                 weight._v_signature = signature
                 #Send it over
                 v_tensor.copy_(weight, non_blocking=non_blocking)
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index 19c9031ea..224e218e3 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -1542,7 +1542,6 @@ class ModelPatcherDynamic(ModelPatcher):
 
                     if vbar is not None and not hasattr(m, "_v"):
                         m._v = vbar.alloc(v_weight_size)
-                        m._v_tensor = comfy_aimdo.torch.aimdo_to_tensor(m._v, device_to)
                     allocated_size += v_weight_size
 
                 else:
@@ -1557,7 +1556,6 @@ class ModelPatcherDynamic(ModelPatcher):
                         weight_size = geometry.numel() * geometry.element_size()
                         if vbar is not None and not hasattr(weight, "_v"):
                             weight._v = vbar.alloc(weight_size)
-                            weight._v_tensor = comfy_aimdo.torch.aimdo_to_tensor(weight._v, device_to)
                             weight._model_dtype = model_dtype
                         allocated_size += weight_size
                     vbar.set_watermark_limit(allocated_size)
diff --git a/comfy/ops.py b/comfy/ops.py
index 33803b223..688937e43 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -83,14 +83,18 @@ def cast_to_input(weight, input, non_blocking=False, copy=True):
 def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype):
     offload_stream = None
     xfer_dest = None
-    cast_geometry = comfy.memory_management.tensors_to_geometries([ s.weight, s.bias ])
 
     signature = comfy_aimdo.model_vbar.vbar_fault(s._v)
-    if signature is not None:
-        xfer_dest = s._v_tensor
     resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature)
+    if signature is not None:
+        if resident:
+            weight = s._v_weight
+            bias = s._v_bias
+        else:
+            xfer_dest = comfy_aimdo.torch.aimdo_to_tensor(s._v, device)
 
     if not resident:
+        cast_geometry = comfy.memory_management.tensors_to_geometries([ s.weight, s.bias ])
         cast_dest = None
 
         xfer_source = [ s.weight, s.bias ]
@@ -140,9 +144,13 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu
                     post_cast.copy_(pre_cast)
             xfer_dest = cast_dest
 
-    params = comfy.memory_management.interpret_gathered_like(cast_geometry, xfer_dest)
-    weight = params[0]
-    bias = params[1]
+        params = comfy.memory_management.interpret_gathered_like(cast_geometry, xfer_dest)
+        weight = params[0]
+        bias = params[1]
+        if signature is not None:
+            s._v_weight = weight
+            s._v_bias = bias
+        s._v_signature=signature
 
     def post_cast(s, param_key, x, dtype, resident, update_weight):
         lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
@@ -182,7 +190,6 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu
     weight = post_cast(s, "weight", weight, dtype, resident, update_weight)
     if s.bias is not None:
         bias = post_cast(s, "bias", bias, bias_dtype, resident, update_weight)
-    s._v_signature=signature
 
     #FIXME: weird offload return protocol
     return weight, bias, (offload_stream, device if signature is not None else None, None)

From 2a4328d639810858aa625c7bfedb974a13a57abe Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Wed, 11 Feb 2026 11:53:42 -0800
Subject: [PATCH 12/25] ace15: Use dynamic_vram friendly trange (#12409)

Factor out the ksampler trange and use it in ACE LLM to prevent the
silent stall at 0 and rate distortion due to first-step model load.
---
 comfy/k_diffusion/sampling.py | 32 ++------------------------------
 comfy/text_encoders/ace15.py  |  3 +--
 comfy/utils.py                | 27 +++++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py
index c0c51d51a..6978eb717 100644
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@@ -1,12 +1,11 @@
 import math
-import time
 from functools import partial
 
 from scipy import integrate
 import torch
 from torch import nn
 import torchsde
-from tqdm.auto import trange as trange_, tqdm
+from tqdm.auto import tqdm
 
 from . import utils
 from . import deis
@@ -15,34 +14,7 @@ import comfy.model_patcher
 import comfy.model_sampling
 
 import comfy.memory_management
-
-
-def trange(*args, **kwargs):
-    if comfy.memory_management.aimdo_allocator is None:
-        return trange_(*args, **kwargs)
-
-    pbar = trange_(*args, **kwargs, smoothing=1.0)
-    pbar._i = 0
-    pbar.set_postfix_str("  Model Initializing ...  ")
-
-    _update = pbar.update
-
-    def warmup_update(n=1):
-        pbar._i += 1
-        if pbar._i == 1:
-            pbar.i1_time = time.time()
-            pbar.set_postfix_str(" Model Initialization complete!  ")
-        elif pbar._i == 2:
-            #bring forward the effective start time based the the diff between first and second iteration
-            #to attempt to remove load overhead from the final step rate estimate.
-            pbar.start_t = pbar.i1_time - (time.time() - pbar.i1_time)
-            pbar.set_postfix_str("")
-
-        _update(n)
-
-    pbar.update = warmup_update
-    return pbar
-
+from comfy.utils import model_trange as trange
 
 def append_zero(x):
     return torch.cat([x, x.new_zeros([1])])
diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py
index 73697b3c1..b8198a820 100644
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@@ -3,7 +3,6 @@ import comfy.text_encoders.llama
 from comfy import sd1_clip
 import torch
 import math
-from tqdm.auto import trange
 import yaml
 import comfy.utils
 
@@ -52,7 +51,7 @@ def sample_manual_loop_no_classes(
 
     progress_bar = comfy.utils.ProgressBar(max_new_tokens)
 
-    for step in trange(max_new_tokens, desc="LM sampling"):
+    for step in comfy.utils.model_trange(max_new_tokens, desc="LM sampling"):
         outputs = model.transformer(None, attention_mask, embeds=embeds.to(execution_dtype), num_tokens=num_tokens, intermediate_output=None, dtype=execution_dtype, embeds_info=embeds_info, past_key_values=past_key_values)
         next_token_logits = model.transformer.logits(outputs[0])[:, -1]
         past_key_values = outputs[2]
diff --git a/comfy/utils.py b/comfy/utils.py
index edd80cebe..e0a94e2e1 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -27,6 +27,7 @@ from PIL import Image
 import logging
 import itertools
 from torch.nn.functional import interpolate
+from tqdm.auto import trange
 from einops import rearrange
 from comfy.cli_args import args, enables_dynamic_vram
 import json
@@ -1155,6 +1156,32 @@ def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_am
 def tiled_scale(samples, function, tile_x=64, tile_y=64, overlap = 8, upscale_amount = 4, out_channels = 3, output_device="cpu", pbar = None):
     return tiled_scale_multidim(samples, function, (tile_y, tile_x), overlap=overlap, upscale_amount=upscale_amount, out_channels=out_channels, output_device=output_device, pbar=pbar)
 
+def model_trange(*args, **kwargs):
+    if comfy.memory_management.aimdo_allocator is None:
+        return trange(*args, **kwargs)
+
+    pbar = trange(*args, **kwargs, smoothing=1.0)
+    pbar._i = 0
+    pbar.set_postfix_str("  Model Initializing ...  ")
+
+    _update = pbar.update
+
+    def warmup_update(n=1):
+        pbar._i += 1
+        if pbar._i == 1:
+            pbar.i1_time = time.time()
+            pbar.set_postfix_str(" Model Initialization complete!  ")
+        elif pbar._i == 2:
+            #bring forward the effective start time based the the diff between first and second iteration
+            #to attempt to remove load overhead from the final step rate estimate.
+            pbar.start_t = pbar.i1_time - (time.time() - pbar.i1_time)
+            pbar.set_postfix_str("")
+
+        _update(n)
+
+    pbar.update = warmup_update
+    return pbar
+
 PROGRESS_BAR_ENABLED = True
 def set_progress_bar_enabled(enabled):
     global PROGRESS_BAR_ENABLED

From 3fe61cedda090c744dcf6f579ed48744fa66ef5f Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Wed, 11 Feb 2026 11:54:02 -0800
Subject: [PATCH 13/25] model_patcher: guard against none model_dtype (#12410)

Handle the case where the _model_dtype exists but is none with the
intended fallback.
---
 comfy/model_patcher.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index 224e218e3..f278fccac 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -1525,7 +1525,7 @@ class ModelPatcherDynamic(ModelPatcher):
                     setattr(m, param_key + "_function", weight_function)
                     geometry = weight
                     if not isinstance(weight, QuantizedTensor):
-                        model_dtype = getattr(m, param_key + "_comfy_model_dtype", weight.dtype)
+                        model_dtype = getattr(m, param_key + "_comfy_model_dtype", None) or weight.dtype
                         weight._model_dtype = model_dtype
                         geometry = comfy.memory_management.TensorGeometry(shape=weight.shape, dtype=model_dtype)
                     return comfy.memory_management.vram_aligned_size(geometry)
@@ -1551,7 +1551,7 @@ class ModelPatcherDynamic(ModelPatcher):
                         weight.seed_key = key
                         set_dirty(weight, dirty)
                         geometry = weight
-                        model_dtype = getattr(m, param + "_comfy_model_dtype", weight.dtype)
+                        model_dtype = getattr(m, param + "_comfy_model_dtype", None) or weight.dtype
                         geometry = comfy.memory_management.TensorGeometry(shape=weight.shape, dtype=model_dtype)
                         weight_size = geometry.numel() * geometry.element_size()
                         if vbar is not None and not hasattr(weight, "_v"):

From e5ae670a4016d3698a806e7f840fcecc50639848 Mon Sep 17 00:00:00 2001
From: askmyteapot <62238146+askmyteapot@users.noreply.github.com>
Date: Thu, 12 Feb 2026 11:28:48 +1000
Subject: [PATCH 14/25] Update ace15.py to allow min_p sampling (#12373)

---
 comfy/text_encoders/ace15.py | 15 ++++++++++++---
 comfy_extras/nodes_ace.py    |  5 +++--
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py
index b8198a820..0fdd4669f 100644
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@@ -16,6 +16,7 @@ def sample_manual_loop_no_classes(
     temperature: float = 0.85,
     top_p: float = 0.9,
     top_k: int = None,
+    min_p: float = 0.000,
     seed: int = 1,
     min_tokens: int = 1,
     max_new_tokens: int = 2048,
@@ -80,6 +81,12 @@ def sample_manual_loop_no_classes(
             min_val = top_k_vals[..., -1, None]
             cfg_logits[cfg_logits < min_val] = remove_logit_value
 
+        if min_p is not None and min_p > 0:
+            probs = torch.softmax(cfg_logits, dim=-1)
+            p_max = probs.max(dim=-1, keepdim=True).values
+            indices_to_remove = probs < (min_p * p_max)
+            cfg_logits[indices_to_remove] = remove_logit_value
+
         if top_p is not None and top_p < 1.0:
             sorted_logits, sorted_indices = torch.sort(cfg_logits, descending=True)
             cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
@@ -110,7 +117,7 @@ def sample_manual_loop_no_classes(
     return output_audio_codes
 
 
-def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=1024, seed=0, cfg_scale=2.0, temperature=0.85, top_p=0.9, top_k=0):
+def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=1024, seed=0, cfg_scale=2.0, temperature=0.85, top_p=0.9, top_k=0, min_p=0.000):
     positive = [[token for token, _ in inner_list] for inner_list in positive]
     positive = positive[0]
 
@@ -134,7 +141,7 @@ def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=102
         paddings = []
         ids = [positive]
 
-    return sample_manual_loop_no_classes(model, ids, paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
+    return sample_manual_loop_no_classes(model, ids, paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
 
 
 class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
@@ -192,6 +199,7 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
         temperature = kwargs.get("temperature", 0.85)
         top_p = kwargs.get("top_p", 0.9)
         top_k = kwargs.get("top_k", 0.0)
+        min_p = kwargs.get("min_p", 0.000)
 
         duration = math.ceil(duration)
         kwargs["duration"] = duration
@@ -239,6 +247,7 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
                               "temperature": temperature,
                               "top_p": top_p,
                               "top_k": top_k,
+                              "min_p": min_p,
                               }
         return out
 
@@ -299,7 +308,7 @@ class ACE15TEModel(torch.nn.Module):
 
         lm_metadata = token_weight_pairs["lm_metadata"]
         if lm_metadata["generate_audio_codes"]:
-            audio_codes = generate_audio_codes(getattr(self, self.lm_model, self.qwen3_06b), token_weight_pairs["lm_prompt"], token_weight_pairs["lm_prompt_negative"], min_tokens=lm_metadata["min_tokens"], max_tokens=lm_metadata["max_tokens"], seed=lm_metadata["seed"], cfg_scale=lm_metadata["cfg_scale"], temperature=lm_metadata["temperature"], top_p=lm_metadata["top_p"], top_k=lm_metadata["top_k"])
+            audio_codes = generate_audio_codes(getattr(self, self.lm_model, self.qwen3_06b), token_weight_pairs["lm_prompt"], token_weight_pairs["lm_prompt_negative"], min_tokens=lm_metadata["min_tokens"], max_tokens=lm_metadata["min_tokens"], seed=lm_metadata["seed"], cfg_scale=lm_metadata["cfg_scale"], temperature=lm_metadata["temperature"], top_p=lm_metadata["top_p"], top_k=lm_metadata["top_k"], min_p=lm_metadata["min_p"])
             out["audio_codes"] = [audio_codes]
 
         return base_out, None, out
diff --git a/comfy_extras/nodes_ace.py b/comfy_extras/nodes_ace.py
index dde5bbd2a..9cf84ab4d 100644
--- a/comfy_extras/nodes_ace.py
+++ b/comfy_extras/nodes_ace.py
@@ -49,13 +49,14 @@ class TextEncodeAceStepAudio15(io.ComfyNode):
                 io.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
                 io.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
                 io.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
+                io.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True),
             ],
             outputs=[io.Conditioning.Output()],
         )
 
     @classmethod
-    def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k) -> io.NodeOutput:
-        tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed, generate_audio_codes=generate_audio_codes, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k)
+    def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> io.NodeOutput:
+        tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed, generate_audio_codes=generate_audio_codes, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p)
         conditioning = clip.encode_from_tokens_scheduled(tokens)
         return io.NodeOutput(conditioning)
 

From 66c18522fbcde5b62731e3fb080a84b14e3dacfc Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Wed, 11 Feb 2026 19:12:16 -0800
Subject: [PATCH 15/25] Add a tip for common error. (#12414)

---
 execution.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/execution.py b/execution.py
index 896862c6b..f549a2f0f 100644
--- a/execution.py
+++ b/execution.py
@@ -623,6 +623,8 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
             logging.info("Memory summary: {}".format(comfy.model_management.debug_memory_summary()))
             logging.error("Got an OOM, unloading all loaded models.")
             comfy.model_management.unload_all_models()
+        elif isinstance(ex, RuntimeError) and ("mat1 and mat2 shapes" in str(ex)) and "Sampler" in class_type:
+            tips = "\n\nTIPS: If you have any \"Load CLIP\" or \"*CLIP Loader\" nodes in your workflow connected to this sampler node make sure the correct file(s) and type is selected."
 
         error_details = {
             "node_id": real_node_id,

From 4a93a62371b64f9d11a140a09faf985c48902d2e Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Thu, 12 Feb 2026 11:38:51 +0200
Subject: [PATCH 16/25] fix(api-nodes): add separate retry budget for 429 rate
 limit responses (#12421)

---
 comfy_api_nodes/util/client.py           | 207 ++++++++++++-----------
 comfy_api_nodes/util/download_helpers.py |  30 ++--
 comfy_api_nodes/util/request_logger.py   |  66 ++++----
 comfy_api_nodes/util/upload_helpers.py   |  55 +++---
 4 files changed, 177 insertions(+), 181 deletions(-)

diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py
index 391748e7a..94886af7b 100644
--- a/comfy_api_nodes/util/client.py
+++ b/comfy_api_nodes/util/client.py
@@ -57,6 +57,7 @@ class _RequestConfig:
     files: dict[str, Any] | list[tuple[str, Any]] | None
     multipart_parser: Callable | None
     max_retries: int
+    max_retries_on_rate_limit: int
     retry_delay: float
     retry_backoff: float
     wait_label: str = "Waiting"
@@ -65,6 +66,7 @@ class _RequestConfig:
     final_label_on_success: str | None = "Completed"
     progress_origin_ts: float | None = None
     price_extractor: Callable[[dict[str, Any]], float | None] | None = None
+    is_rate_limited: Callable[[int, Any], bool] | None = None
 
 
 @dataclass
@@ -78,7 +80,7 @@ class _PollUIState:
     active_since: float | None = None  # start time of current active interval (None if queued)
 
 
-_RETRY_STATUS = {408, 429, 500, 502, 503, 504}
+_RETRY_STATUS = {408, 500, 502, 503, 504}  # status 429 is handled separately
 COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done", "complete"]
 FAILED_STATUSES = ["cancelled", "canceled", "canceling", "fail", "failed", "error"]
 QUEUED_STATUSES = ["created", "queued", "queueing", "submitted", "initializing"]
@@ -103,6 +105,8 @@ async def sync_op(
     final_label_on_success: str | None = "Completed",
     progress_origin_ts: float | None = None,
     monitor_progress: bool = True,
+    max_retries_on_rate_limit: int = 16,
+    is_rate_limited: Callable[[int, Any], bool] | None = None,
 ) -> M:
     raw = await sync_op_raw(
         cls,
@@ -122,6 +126,8 @@ async def sync_op(
         final_label_on_success=final_label_on_success,
         progress_origin_ts=progress_origin_ts,
         monitor_progress=monitor_progress,
+        max_retries_on_rate_limit=max_retries_on_rate_limit,
+        is_rate_limited=is_rate_limited,
     )
     if not isinstance(raw, dict):
         raise Exception("Expected JSON response to validate into a Pydantic model, got non-JSON (binary or text).")
@@ -194,6 +200,8 @@ async def sync_op_raw(
     final_label_on_success: str | None = "Completed",
     progress_origin_ts: float | None = None,
     monitor_progress: bool = True,
+    max_retries_on_rate_limit: int = 16,
+    is_rate_limited: Callable[[int, Any], bool] | None = None,
 ) -> dict[str, Any] | bytes:
     """
     Make a single network request.
@@ -222,6 +230,8 @@ async def sync_op_raw(
         final_label_on_success=final_label_on_success,
         progress_origin_ts=progress_origin_ts,
         price_extractor=price_extractor,
+        max_retries_on_rate_limit=max_retries_on_rate_limit,
+        is_rate_limited=is_rate_limited,
     )
     return await _request_base(cfg, expect_binary=as_binary)
 
@@ -506,7 +516,7 @@ def _friendly_http_message(status: int, body: Any) -> str:
     if status == 409:
         return "There is a problem with your account. Please contact support@comfy.org."
     if status == 429:
-        return "Rate Limit Exceeded: Please try again later."
+        return "Rate Limit Exceeded: The server returned 429 after all retry attempts. Please wait and try again."
     try:
         if isinstance(body, dict):
             err = body.get("error")
@@ -586,6 +596,8 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
     start_time = cfg.progress_origin_ts if cfg.progress_origin_ts is not None else time.monotonic()
     attempt = 0
     delay = cfg.retry_delay
+    rate_limit_attempts = 0
+    rate_limit_delay = cfg.retry_delay
     operation_succeeded: bool = False
     final_elapsed_seconds: int | None = None
     extracted_price: float | None = None
@@ -653,17 +665,14 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
                 payload_headers["Content-Type"] = "application/json"
                 payload_kw["json"] = cfg.data or {}
 
-            try:
-                request_logger.log_request_response(
-                    operation_id=operation_id,
-                    request_method=method,
-                    request_url=url,
-                    request_headers=dict(payload_headers) if payload_headers else None,
-                    request_params=dict(params) if params else None,
-                    request_data=request_body_log,
-                )
-            except Exception as _log_e:
-                logging.debug("[DEBUG] request logging failed: %s", _log_e)
+            request_logger.log_request_response(
+                operation_id=operation_id,
+                request_method=method,
+                request_url=url,
+                request_headers=dict(payload_headers) if payload_headers else None,
+                request_params=dict(params) if params else None,
+                request_data=request_body_log,
+            )
 
             req_coro = sess.request(method, url, params=params, **payload_kw)
             req_task = asyncio.create_task(req_coro)
@@ -688,41 +697,33 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
                         body = await resp.json()
                     except (ContentTypeError, json.JSONDecodeError):
                         body = await resp.text()
-                    if resp.status in _RETRY_STATUS and attempt <= cfg.max_retries:
+                    should_retry = False
+                    wait_time = 0.0
+                    retry_label = ""
+                    is_rl = resp.status == 429 or (
+                        cfg.is_rate_limited is not None and cfg.is_rate_limited(resp.status, body)
+                    )
+                    if is_rl and rate_limit_attempts < cfg.max_retries_on_rate_limit:
+                        rate_limit_attempts += 1
+                        wait_time = min(rate_limit_delay, 30.0)
+                        rate_limit_delay *= cfg.retry_backoff
+                        retry_label = f"rate-limit retry {rate_limit_attempts} of {cfg.max_retries_on_rate_limit}"
+                        should_retry = True
+                    elif resp.status in _RETRY_STATUS and (attempt - rate_limit_attempts) <= cfg.max_retries:
+                        wait_time = delay
+                        delay *= cfg.retry_backoff
+                        retry_label = f"retry {attempt - rate_limit_attempts} of {cfg.max_retries}"
+                        should_retry = True
+
+                    if should_retry:
                         logging.warning(
-                            "HTTP %s %s -> %s. Retrying in %.2fs (retry %d of %d).",
+                            "HTTP %s %s -> %s. Waiting %.2fs (%s).",
                             method,
                             url,
                             resp.status,
-                            delay,
-                            attempt,
-                            cfg.max_retries,
+                            wait_time,
+                            retry_label,
                         )
-                        try:
-                            request_logger.log_request_response(
-                                operation_id=operation_id,
-                                request_method=method,
-                                request_url=url,
-                                response_status_code=resp.status,
-                                response_headers=dict(resp.headers),
-                                response_content=body,
-                                error_message=_friendly_http_message(resp.status, body),
-                            )
-                        except Exception as _log_e:
-                            logging.debug("[DEBUG] response logging failed: %s", _log_e)
-
-                        await sleep_with_interrupt(
-                            delay,
-                            cfg.node_cls,
-                            cfg.wait_label if cfg.monitor_progress else None,
-                            start_time if cfg.monitor_progress else None,
-                            cfg.estimated_total,
-                            display_callback=_display_time_progress if cfg.monitor_progress else None,
-                        )
-                        delay *= cfg.retry_backoff
-                        continue
-                    msg = _friendly_http_message(resp.status, body)
-                    try:
                         request_logger.log_request_response(
                             operation_id=operation_id,
                             request_method=method,
@@ -730,10 +731,27 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
                             response_status_code=resp.status,
                             response_headers=dict(resp.headers),
                             response_content=body,
-                            error_message=msg,
+                            error_message=f"HTTP {resp.status} ({retry_label}, will retry in {wait_time:.1f}s)",
                         )
-                    except Exception as _log_e:
-                        logging.debug("[DEBUG] response logging failed: %s", _log_e)
+                        await sleep_with_interrupt(
+                            wait_time,
+                            cfg.node_cls,
+                            cfg.wait_label if cfg.monitor_progress else None,
+                            start_time if cfg.monitor_progress else None,
+                            cfg.estimated_total,
+                            display_callback=_display_time_progress if cfg.monitor_progress else None,
+                        )
+                        continue
+                    msg = _friendly_http_message(resp.status, body)
+                    request_logger.log_request_response(
+                        operation_id=operation_id,
+                        request_method=method,
+                        request_url=url,
+                        response_status_code=resp.status,
+                        response_headers=dict(resp.headers),
+                        response_content=body,
+                        error_message=msg,
+                    )
                     raise Exception(msg)
 
                 if expect_binary:
@@ -753,17 +771,14 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
                     bytes_payload = bytes(buff)
                     operation_succeeded = True
                     final_elapsed_seconds = int(time.monotonic() - start_time)
-                    try:
-                        request_logger.log_request_response(
-                            operation_id=operation_id,
-                            request_method=method,
-                            request_url=url,
-                            response_status_code=resp.status,
-                            response_headers=dict(resp.headers),
-                            response_content=bytes_payload,
-                        )
-                    except Exception as _log_e:
-                        logging.debug("[DEBUG] response logging failed: %s", _log_e)
+                    request_logger.log_request_response(
+                        operation_id=operation_id,
+                        request_method=method,
+                        request_url=url,
+                        response_status_code=resp.status,
+                        response_headers=dict(resp.headers),
+                        response_content=bytes_payload,
+                    )
                     return bytes_payload
                 else:
                     try:
@@ -780,45 +795,39 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
                         extracted_price = cfg.price_extractor(payload) if cfg.price_extractor else None
                     operation_succeeded = True
                     final_elapsed_seconds = int(time.monotonic() - start_time)
-                    try:
-                        request_logger.log_request_response(
-                            operation_id=operation_id,
-                            request_method=method,
-                            request_url=url,
-                            response_status_code=resp.status,
-                            response_headers=dict(resp.headers),
-                            response_content=response_content_to_log,
-                        )
-                    except Exception as _log_e:
-                        logging.debug("[DEBUG] response logging failed: %s", _log_e)
+                    request_logger.log_request_response(
+                        operation_id=operation_id,
+                        request_method=method,
+                        request_url=url,
+                        response_status_code=resp.status,
+                        response_headers=dict(resp.headers),
+                        response_content=response_content_to_log,
+                    )
                     return payload
 
         except ProcessingInterrupted:
             logging.debug("Polling was interrupted by user")
             raise
         except (ClientError, OSError) as e:
-            if attempt <= cfg.max_retries:
+            if (attempt - rate_limit_attempts) <= cfg.max_retries:
                 logging.warning(
                     "Connection error calling %s %s. Retrying in %.2fs (%d/%d): %s",
                     method,
                     url,
                     delay,
-                    attempt,
+                    attempt - rate_limit_attempts,
                     cfg.max_retries,
                     str(e),
                 )
-                try:
-                    request_logger.log_request_response(
-                        operation_id=operation_id,
-                        request_method=method,
-                        request_url=url,
-                        request_headers=dict(payload_headers) if payload_headers else None,
-                        request_params=dict(params) if params else None,
-                        request_data=request_body_log,
-                        error_message=f"{type(e).__name__}: {str(e)} (will retry)",
-                    )
-                except Exception as _log_e:
-                    logging.debug("[DEBUG] request error logging failed: %s", _log_e)
+                request_logger.log_request_response(
+                    operation_id=operation_id,
+                    request_method=method,
+                    request_url=url,
+                    request_headers=dict(payload_headers) if payload_headers else None,
+                    request_params=dict(params) if params else None,
+                    request_data=request_body_log,
+                    error_message=f"{type(e).__name__}: {str(e)} (will retry)",
+                )
                 await sleep_with_interrupt(
                     delay,
                     cfg.node_cls,
@@ -831,23 +840,6 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
                 continue
             diag = await _diagnose_connectivity()
             if not diag["internet_accessible"]:
-                try:
-                    request_logger.log_request_response(
-                        operation_id=operation_id,
-                        request_method=method,
-                        request_url=url,
-                        request_headers=dict(payload_headers) if payload_headers else None,
-                        request_params=dict(params) if params else None,
-                        request_data=request_body_log,
-                        error_message=f"LocalNetworkError: {str(e)}",
-                    )
-                except Exception as _log_e:
-                    logging.debug("[DEBUG] final error logging failed: %s", _log_e)
-                raise LocalNetworkError(
-                    "Unable to connect to the API server due to local network issues. "
-                    "Please check your internet connection and try again."
-                ) from e
-            try:
                 request_logger.log_request_response(
                     operation_id=operation_id,
                     request_method=method,
@@ -855,10 +847,21 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
                     request_headers=dict(payload_headers) if payload_headers else None,
                     request_params=dict(params) if params else None,
                     request_data=request_body_log,
-                    error_message=f"ApiServerError: {str(e)}",
+                    error_message=f"LocalNetworkError: {str(e)}",
                 )
-            except Exception as _log_e:
-                logging.debug("[DEBUG] final error logging failed: %s", _log_e)
+                raise LocalNetworkError(
+                    "Unable to connect to the API server due to local network issues. "
+                    "Please check your internet connection and try again."
+                ) from e
+            request_logger.log_request_response(
+                operation_id=operation_id,
+                request_method=method,
+                request_url=url,
+                request_headers=dict(payload_headers) if payload_headers else None,
+                request_params=dict(params) if params else None,
+                request_data=request_body_log,
+                error_message=f"ApiServerError: {str(e)}",
+            )
             raise ApiServerError(
                 f"The API server at {default_base_url()} is currently unreachable. "
                 f"The service may be experiencing issues."
diff --git a/comfy_api_nodes/util/download_helpers.py b/comfy_api_nodes/util/download_helpers.py
index 78bcf1fa1..aa588d038 100644
--- a/comfy_api_nodes/util/download_helpers.py
+++ b/comfy_api_nodes/util/download_helpers.py
@@ -167,27 +167,25 @@ async def download_url_to_bytesio(
                     with contextlib.suppress(Exception):
                         dest.seek(0)
 
-                with contextlib.suppress(Exception):
-                    request_logger.log_request_response(
-                        operation_id=op_id,
-                        request_method="GET",
-                        request_url=url,
-                        response_status_code=resp.status,
-                        response_headers=dict(resp.headers),
-                        response_content=f"[streamed {written} bytes to dest]",
-                    )
+                request_logger.log_request_response(
+                    operation_id=op_id,
+                    request_method="GET",
+                    request_url=url,
+                    response_status_code=resp.status,
+                    response_headers=dict(resp.headers),
+                    response_content=f"[streamed {written} bytes to dest]",
+                )
                 return
         except asyncio.CancelledError:
             raise ProcessingInterrupted("Task cancelled") from None
         except (ClientError, OSError) as e:
             if attempt <= max_retries:
-                with contextlib.suppress(Exception):
-                    request_logger.log_request_response(
-                        operation_id=op_id,
-                        request_method="GET",
-                        request_url=url,
-                        error_message=f"{type(e).__name__}: {str(e)} (will retry)",
-                    )
+                request_logger.log_request_response(
+                    operation_id=op_id,
+                    request_method="GET",
+                    request_url=url,
+                    error_message=f"{type(e).__name__}: {str(e)} (will retry)",
+                )
                 await sleep_with_interrupt(delay, cls, None, None, None)
                 delay *= retry_backoff
                 continue
diff --git a/comfy_api_nodes/util/request_logger.py b/comfy_api_nodes/util/request_logger.py
index e0cb4428d..fe0543d9b 100644
--- a/comfy_api_nodes/util/request_logger.py
+++ b/comfy_api_nodes/util/request_logger.py
@@ -8,7 +8,6 @@ from typing import Any
 
 import folder_paths
 
-# Get the logger instance
 logger = logging.getLogger(__name__)
 
 
@@ -91,38 +90,41 @@ def log_request_response(
     Filenames are sanitized and length-limited for cross-platform safety.
     If we still fail to write, we fall back to appending into api.log.
     """
-    log_dir = get_log_directory()
-    filepath = _build_log_filepath(log_dir, operation_id, request_url)
-
-    log_content: list[str] = []
-    log_content.append(f"Timestamp: {datetime.datetime.now().isoformat()}")
-    log_content.append(f"Operation ID: {operation_id}")
-    log_content.append("-" * 30 + " REQUEST " + "-" * 30)
-    log_content.append(f"Method: {request_method}")
-    log_content.append(f"URL: {request_url}")
-    if request_headers:
-        log_content.append(f"Headers:\n{_format_data_for_logging(request_headers)}")
-    if request_params:
-        log_content.append(f"Params:\n{_format_data_for_logging(request_params)}")
-    if request_data is not None:
-        log_content.append(f"Data/Body:\n{_format_data_for_logging(request_data)}")
-
-    log_content.append("\n" + "-" * 30 + " RESPONSE " + "-" * 30)
-    if response_status_code is not None:
-        log_content.append(f"Status Code: {response_status_code}")
-    if response_headers:
-        log_content.append(f"Headers:\n{_format_data_for_logging(response_headers)}")
-    if response_content is not None:
-        log_content.append(f"Content:\n{_format_data_for_logging(response_content)}")
-    if error_message:
-        log_content.append(f"Error:\n{error_message}")
-
     try:
-        with open(filepath, "w", encoding="utf-8") as f:
-            f.write("\n".join(log_content))
-        logger.debug("API log saved to: %s", filepath)
-    except Exception as e:
-        logger.error("Error writing API log to %s: %s", filepath, str(e))
+        log_dir = get_log_directory()
+        filepath = _build_log_filepath(log_dir, operation_id, request_url)
+
+        log_content: list[str] = []
+        log_content.append(f"Timestamp: {datetime.datetime.now().isoformat()}")
+        log_content.append(f"Operation ID: {operation_id}")
+        log_content.append("-" * 30 + " REQUEST " + "-" * 30)
+        log_content.append(f"Method: {request_method}")
+        log_content.append(f"URL: {request_url}")
+        if request_headers:
+            log_content.append(f"Headers:\n{_format_data_for_logging(request_headers)}")
+        if request_params:
+            log_content.append(f"Params:\n{_format_data_for_logging(request_params)}")
+        if request_data is not None:
+            log_content.append(f"Data/Body:\n{_format_data_for_logging(request_data)}")
+
+        log_content.append("\n" + "-" * 30 + " RESPONSE " + "-" * 30)
+        if response_status_code is not None:
+            log_content.append(f"Status Code: {response_status_code}")
+        if response_headers:
+            log_content.append(f"Headers:\n{_format_data_for_logging(response_headers)}")
+        if response_content is not None:
+            log_content.append(f"Content:\n{_format_data_for_logging(response_content)}")
+        if error_message:
+            log_content.append(f"Error:\n{error_message}")
+
+        try:
+            with open(filepath, "w", encoding="utf-8") as f:
+                f.write("\n".join(log_content))
+            logger.debug("API log saved to: %s", filepath)
+        except Exception as e:
+            logger.error("Error writing API log to %s: %s", filepath, str(e))
+    except Exception as _log_e:
+        logging.debug("[DEBUG] log_request_response failed: %s", _log_e)
 
 
 if __name__ == '__main__':
diff --git a/comfy_api_nodes/util/upload_helpers.py b/comfy_api_nodes/util/upload_helpers.py
index 83d936ce1..7cc565263 100644
--- a/comfy_api_nodes/util/upload_helpers.py
+++ b/comfy_api_nodes/util/upload_helpers.py
@@ -255,17 +255,14 @@ async def upload_file(
         monitor_task = asyncio.create_task(_monitor())
         sess: aiohttp.ClientSession | None = None
         try:
-            try:
-                request_logger.log_request_response(
-                    operation_id=operation_id,
-                    request_method="PUT",
-                    request_url=upload_url,
-                    request_headers=headers or None,
-                    request_params=None,
-                    request_data=f"[File data {len(data)} bytes]",
-                )
-            except Exception as e:
-                logging.debug("[DEBUG] upload request logging failed: %s", e)
+            request_logger.log_request_response(
+                operation_id=operation_id,
+                request_method="PUT",
+                request_url=upload_url,
+                request_headers=headers or None,
+                request_params=None,
+                request_data=f"[File data {len(data)} bytes]",
+            )
 
             sess = aiohttp.ClientSession(timeout=timeout)
             req = sess.put(upload_url, data=data, headers=headers, skip_auto_headers=skip_auto_headers)
@@ -311,31 +308,27 @@ async def upload_file(
                         delay *= retry_backoff
                         continue
                     raise Exception(f"Failed to upload (HTTP {resp.status}).")
-                try:
-                    request_logger.log_request_response(
-                        operation_id=operation_id,
-                        request_method="PUT",
-                        request_url=upload_url,
-                        response_status_code=resp.status,
-                        response_headers=dict(resp.headers),
-                        response_content="File uploaded successfully.",
-                    )
-                except Exception as e:
-                    logging.debug("[DEBUG] upload response logging failed: %s", e)
+                request_logger.log_request_response(
+                    operation_id=operation_id,
+                    request_method="PUT",
+                    request_url=upload_url,
+                    response_status_code=resp.status,
+                    response_headers=dict(resp.headers),
+                    response_content="File uploaded successfully.",
+                )
                 return
         except asyncio.CancelledError:
             raise ProcessingInterrupted("Task cancelled") from None
         except (aiohttp.ClientError, OSError) as e:
             if attempt <= max_retries:
-                with contextlib.suppress(Exception):
-                    request_logger.log_request_response(
-                        operation_id=operation_id,
-                        request_method="PUT",
-                        request_url=upload_url,
-                        request_headers=headers or None,
-                        request_data=f"[File data {len(data)} bytes]",
-                        error_message=f"{type(e).__name__}: {str(e)} (will retry)",
-                    )
+                request_logger.log_request_response(
+                    operation_id=operation_id,
+                    request_method="PUT",
+                    request_url=upload_url,
+                    request_headers=headers or None,
+                    request_data=f"[File data {len(data)} bytes]",
+                    error_message=f"{type(e).__name__}: {str(e)} (will retry)",
+                )
                 await sleep_with_interrupt(
                     delay,
                     cls,

From 117e2143543dd649d47345e183748a82d48d12d3 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Thu, 12 Feb 2026 16:51:50 -0800
Subject: [PATCH 17/25] ModelPatcherDynamic: force load non leaf weights
 (#12433)

The current behaviour of the default ModelPatcher is to .to a model
only if its fully loaded, which is how random non-leaf weights get
loaded in non-LowVRAM conditions.

The however means they never get loaded in dynamic_vram. In the
dynamic_vram case, force load them to the GPU.
---
 comfy/model_patcher.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index f278fccac..b1d907ba4 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -679,18 +679,19 @@ class ModelPatcher:
         for key in list(self.pinned):
             self.unpin_weight(key)
 
-    def _load_list(self, prio_comfy_cast_weights=False):
+    def _load_list(self, prio_comfy_cast_weights=False, default_device=None):
         loading = []
         for n, m in self.model.named_modules():
-            params = []
-            skip = False
-            for name, param in m.named_parameters(recurse=False):
-                params.append(name)
+            default = False
+            params = { name: param for name, param in m.named_parameters(recurse=False) }
             for name, param in m.named_parameters(recurse=True):
                 if name not in params:
-                    skip = True # skip random weights in non leaf modules
+                    default = True # default random weights in non leaf modules
                     break
-            if not skip and (hasattr(m, "comfy_cast_weights") or len(params) > 0):
+            if default and default_device is not None:
+                for param in params.values():
+                    param.data = param.data.to(device=default_device)
+            if not default and (hasattr(m, "comfy_cast_weights") or len(params) > 0):
                 module_mem = comfy.model_management.module_size(m)
                 module_offload_mem = module_mem
                 if hasattr(m, "comfy_cast_weights"):
@@ -1495,7 +1496,7 @@ class ModelPatcherDynamic(ModelPatcher):
             #with pin and unpin syncrhonization which can be expensive for small weights
             #with a high layer rate (e.g. autoregressive LLMs).
             #prioritize the non-comfy weights (note the order reverse).
-            loading = self._load_list(prio_comfy_cast_weights=True)
+            loading = self._load_list(prio_comfy_cast_weights=True, default_device=device_to)
             loading.sort(reverse=True)
 
             for x in loading:
@@ -1579,7 +1580,7 @@ class ModelPatcherDynamic(ModelPatcher):
         return 0 if vbar is None else vbar.free_memory(memory_to_free)
 
     def partially_unload_ram(self, ram_to_unload):
-        loading = self._load_list(prio_comfy_cast_weights=True)
+        loading = self._load_list(prio_comfy_cast_weights=True, default_device=self.offload_device)
         for x in loading:
             _, _, _, _, m, _ = x
             ram_to_unload -= comfy.pinned_memory.unpin_memory(m)

From ae79e33345dc893f8e0632c380c0e91dc09ac6e8 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Thu, 12 Feb 2026 16:56:42 -0800
Subject: [PATCH 18/25] llama: use a more efficient rope implementation
 (#12434)

Get rid of the cat and unary negation and inplace add-cmul the two
halves of the rope. Precompute -sin once at the start of the model
rather than every transformer block.

This is slightly faster on both GPU and CPU bound setups.
---
 comfy/text_encoders/llama.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py
index b6735d210..54f3d5595 100644
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@@ -355,13 +355,6 @@ class RMSNorm(nn.Module):
 
 
 
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
 def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_dims=None, device=None):
     if not isinstance(theta, list):
         theta = [theta]
@@ -390,20 +383,30 @@ def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_di
         else:
             cos = cos.unsqueeze(1)
             sin = sin.unsqueeze(1)
-        out.append((cos, sin))
+        sin_split = sin.shape[-1] // 2
+        out.append((cos, sin[..., : sin_split], -sin[..., sin_split :]))
 
     if len(out) == 1:
         return out[0]
 
     return out
 
-
 def apply_rope(xq, xk, freqs_cis):
     org_dtype = xq.dtype
     cos = freqs_cis[0]
     sin = freqs_cis[1]
-    q_embed = (xq * cos) + (rotate_half(xq) * sin)
-    k_embed = (xk * cos) + (rotate_half(xk) * sin)
+    nsin = freqs_cis[2]
+
+    q_embed = (xq * cos)
+    q_split = q_embed.shape[-1] // 2
+    q_embed[..., : q_split].addcmul_(xq[..., q_split :], nsin)
+    q_embed[..., q_split :].addcmul_(xq[..., : q_split], sin)
+
+    k_embed = (xk * cos)
+    k_split = k_embed.shape[-1] // 2
+    k_embed[..., : k_split].addcmul_(xk[..., k_split :], nsin)
+    k_embed[..., k_split :].addcmul_(xk[..., : k_split], sin)
+
     return q_embed.to(org_dtype), k_embed.to(org_dtype)
 
 

From e03fe8b5919a23a473cea6e53f916f7403c082a5 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 12 Feb 2026 20:29:12 -0800
Subject: [PATCH 19/25] Update command to install AMD stable linux pytorch.
 (#12437)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 96dc2904b..3ccdc9c19 100644
--- a/README.md
+++ b/README.md
@@ -227,7 +227,7 @@ Put your VAE in: models/vae
 
 AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:
 
-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.1```
 
 This is the command to install the nightly with ROCm 7.1 which might have some performance improvements:
 

From 8902907d7ab949ce42dd9b658b4a4582ed9fb630 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Fri, 13 Feb 2026 12:29:37 -0800
Subject: [PATCH 20/25] dynamic_vram: Training fixes (#12442)

---
 comfy/model_patcher.py      |  4 ++++
 comfy_extras/nodes_train.py | 11 ++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index b1d907ba4..67dce088e 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -1561,6 +1561,8 @@ class ModelPatcherDynamic(ModelPatcher):
                         allocated_size += weight_size
                     vbar.set_watermark_limit(allocated_size)
 
+                move_weight_functions(m, device_to)
+
             logging.info(f"Model {self.model.__class__.__name__} prepared for dynamic VRAM loading. {allocated_size // (1024 ** 2)}MB Staged. {num_patches} patches attached.")
 
             self.model.device = device_to
@@ -1601,6 +1603,8 @@ class ModelPatcherDynamic(ModelPatcher):
         if unpatch_weights:
             self.partially_unload_ram(1e32)
             self.partially_unload(None, 1e32)
+            for m in self.model.modules():
+                move_weight_functions(m, device_to)
 
     def partially_load(self, device_to, extra_memory=0, force_patch_weights=False):
         assert not force_patch_weights #See above
diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py
index 630eedc9f..aa2d88673 100644
--- a/comfy_extras/nodes_train.py
+++ b/comfy_extras/nodes_train.py
@@ -1035,7 +1035,7 @@ class TrainLoraNode(io.ComfyNode):
                 io.Boolean.Input(
                     "offloading",
                     default=False,
-                    tooltip="Depth level for gradient checkpointing.",
+                    tooltip="Offload the Model to RAM. Requires Bypass Mode.",
                 ),
                 io.Combo.Input(
                     "existing_lora",
@@ -1124,6 +1124,15 @@ class TrainLoraNode(io.ComfyNode):
         lora_dtype = node_helpers.string_to_torch_dtype(lora_dtype)
         mp.set_model_compute_dtype(dtype)
 
+        if mp.is_dynamic():
+            if not bypass_mode:
+                logging.info("Training MP is Dynamic - forcing bypass mode. Start comfy with --highvram to force weight diff mode")
+                bypass_mode = True
+            offloading = True
+        elif offloading:
+            if not bypass_mode:
+                logging.info("Training Offload selected - forcing bypass mode. Set bypass = True to remove this message")
+
         # Prepare latents and compute counts
         latents, num_images, multi_res = _prepare_latents_and_count(
             latents, dtype, bucket_mode

From e1add563f9e89026e8c4e8825a2b279fbd67d23a Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 13 Feb 2026 12:35:13 -0800
Subject: [PATCH 21/25] Use torch RMSNorm for flux models and refactor hunyuan
 video code. (#12432)

---
 comfy/controlnet.py                 |  1 +
 comfy/ldm/chroma/layers.py          |  3 +-
 comfy/ldm/chroma_radiance/layers.py |  8 ++---
 comfy/ldm/flux/layers.py            | 53 +++++++----------------------
 comfy/ldm/flux/model.py             |  3 +-
 comfy/ldm/hunyuan_video/model.py    | 11 +++---
 comfy/lora_convert.py               |  2 +-
 comfy/model_detection.py            | 18 +++++++---
 comfy/supported_models.py           | 32 +++++++++++++++--
 comfy/utils.py                      | 12 +++----
 10 files changed, 74 insertions(+), 69 deletions(-)

diff --git a/comfy/controlnet.py b/comfy/controlnet.py
index 9e1e704e0..8336412f2 100644
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@@ -560,6 +560,7 @@ def load_controlnet_hunyuandit(controlnet_data, model_options={}):
 def load_controlnet_flux_xlabs_mistoline(sd, mistoline=False, model_options={}):
     model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options)
     control_model = comfy.ldm.flux.controlnet.ControlNetFlux(mistoline=mistoline, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
+    sd = model_config.process_unet_state_dict(sd)
     control_model = controlnet_load_state_dict(control_model, sd)
     extra_conds = ['y', 'guidance']
     control = ControlNet(control_model, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
diff --git a/comfy/ldm/chroma/layers.py b/comfy/ldm/chroma/layers.py
index 2d5684348..df348a8ed 100644
--- a/comfy/ldm/chroma/layers.py
+++ b/comfy/ldm/chroma/layers.py
@@ -3,7 +3,6 @@ from torch import Tensor, nn
 
 from comfy.ldm.flux.layers import (
     MLPEmbedder,
-    RMSNorm,
     ModulationOut,
 )
 
@@ -29,7 +28,7 @@ class Approximator(nn.Module):
         super().__init__()
         self.in_proj = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
         self.layers = nn.ModuleList([MLPEmbedder(hidden_dim, hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)])
-        self.norms = nn.ModuleList([RMSNorm(hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)])
+        self.norms = nn.ModuleList([operations.RMSNorm(hidden_dim, dtype=dtype, device=device) for x in range( n_layers)])
         self.out_proj = operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device)
 
     @property
diff --git a/comfy/ldm/chroma_radiance/layers.py b/comfy/ldm/chroma_radiance/layers.py
index 3c7bc9b6b..08d31e0ba 100644
--- a/comfy/ldm/chroma_radiance/layers.py
+++ b/comfy/ldm/chroma_radiance/layers.py
@@ -4,8 +4,6 @@ from functools import lru_cache
 import torch
 from torch import nn
 
-from comfy.ldm.flux.layers import RMSNorm
-
 
 class NerfEmbedder(nn.Module):
     """
@@ -145,7 +143,7 @@ class NerfGLUBlock(nn.Module):
         # We now need to generate parameters for 3 matrices.
         total_params = 3 * hidden_size_x**2 * mlp_ratio
         self.param_generator = operations.Linear(hidden_size_s, total_params, dtype=dtype, device=device)
-        self.norm = RMSNorm(hidden_size_x, dtype=dtype, device=device, operations=operations)
+        self.norm = operations.RMSNorm(hidden_size_x, dtype=dtype, device=device)
         self.mlp_ratio = mlp_ratio
 
 
@@ -178,7 +176,7 @@ class NerfGLUBlock(nn.Module):
 class NerfFinalLayer(nn.Module):
     def __init__(self, hidden_size, out_channels, dtype=None, device=None, operations=None):
         super().__init__()
-        self.norm = RMSNorm(hidden_size, dtype=dtype, device=device, operations=operations)
+        self.norm = operations.RMSNorm(hidden_size, dtype=dtype, device=device)
         self.linear = operations.Linear(hidden_size, out_channels, dtype=dtype, device=device)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -190,7 +188,7 @@ class NerfFinalLayer(nn.Module):
 class NerfFinalLayerConv(nn.Module):
     def __init__(self, hidden_size: int, out_channels: int, dtype=None, device=None, operations=None):
         super().__init__()
-        self.norm = RMSNorm(hidden_size, dtype=dtype, device=device, operations=operations)
+        self.norm = operations.RMSNorm(hidden_size, dtype=dtype, device=device)
         self.conv = operations.Conv2d(
             in_channels=hidden_size,
             out_channels=out_channels,
diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py
index 60f2bdae2..1f2975fb1 100644
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@@ -5,8 +5,6 @@ import torch
 from torch import Tensor, nn
 
 from .math import attention, rope
-import comfy.ops
-import comfy.ldm.common_dit
 
 
 class EmbedND(nn.Module):
@@ -87,20 +85,12 @@ def build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=False, yak_mlp=False, dt
             operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
         )
 
-class RMSNorm(torch.nn.Module):
-    def __init__(self, dim: int, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.scale = nn.Parameter(torch.empty((dim), dtype=dtype, device=device))
-
-    def forward(self, x: Tensor):
-        return comfy.ldm.common_dit.rms_norm(x, self.scale, 1e-6)
-
 
 class QKNorm(torch.nn.Module):
     def __init__(self, dim: int, dtype=None, device=None, operations=None):
         super().__init__()
-        self.query_norm = RMSNorm(dim, dtype=dtype, device=device, operations=operations)
-        self.key_norm = RMSNorm(dim, dtype=dtype, device=device, operations=operations)
+        self.query_norm = operations.RMSNorm(dim, dtype=dtype, device=device)
+        self.key_norm = operations.RMSNorm(dim, dtype=dtype, device=device)
 
     def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple:
         q = self.query_norm(q)
@@ -169,7 +159,7 @@ class SiLUActivation(nn.Module):
 
 
 class DoubleStreamBlock(nn.Module):
-    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, yak_mlp=False, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, modulation=True, mlp_silu_act=False, proj_bias=True, yak_mlp=False, dtype=None, device=None, operations=None):
         super().__init__()
 
         mlp_hidden_dim = int(hidden_size * mlp_ratio)
@@ -197,8 +187,6 @@ class DoubleStreamBlock(nn.Module):
 
         self.txt_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations)
 
-        self.flipped_img_txt = flipped_img_txt
-
     def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}):
         if self.modulation:
             img_mod1, img_mod2 = self.img_mod(vec)
@@ -224,32 +212,17 @@ class DoubleStreamBlock(nn.Module):
         del txt_qkv
         txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
 
-        if self.flipped_img_txt:
-            q = torch.cat((img_q, txt_q), dim=2)
-            del img_q, txt_q
-            k = torch.cat((img_k, txt_k), dim=2)
-            del img_k, txt_k
-            v = torch.cat((img_v, txt_v), dim=2)
-            del img_v, txt_v
-            # run actual attention
-            attn = attention(q, k, v,
-                             pe=pe, mask=attn_mask, transformer_options=transformer_options)
-            del q, k, v
+        q = torch.cat((txt_q, img_q), dim=2)
+        del txt_q, img_q
+        k = torch.cat((txt_k, img_k), dim=2)
+        del txt_k, img_k
+        v = torch.cat((txt_v, img_v), dim=2)
+        del txt_v, img_v
+        # run actual attention
+        attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
+        del q, k, v
 
-            img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1]:]
-        else:
-            q = torch.cat((txt_q, img_q), dim=2)
-            del txt_q, img_q
-            k = torch.cat((txt_k, img_k), dim=2)
-            del txt_k, img_k
-            v = torch.cat((txt_v, img_v), dim=2)
-            del txt_v, img_v
-            # run actual attention
-            attn = attention(q, k, v,
-                             pe=pe, mask=attn_mask, transformer_options=transformer_options)
-            del q, k, v
-
-            txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
+        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
 
         # calculate the img bloks
         img += apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py
index f40c2a7a9..260ccad7e 100644
--- a/comfy/ldm/flux/model.py
+++ b/comfy/ldm/flux/model.py
@@ -16,7 +16,6 @@ from .layers import (
     SingleStreamBlock,
     timestep_embedding,
     Modulation,
-    RMSNorm
 )
 
 @dataclass
@@ -81,7 +80,7 @@ class Flux(nn.Module):
         self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
 
         if params.txt_norm:
-            self.txt_norm = RMSNorm(params.context_in_dim, dtype=dtype, device=device, operations=operations)
+            self.txt_norm = operations.RMSNorm(params.context_in_dim, dtype=dtype, device=device)
         else:
             self.txt_norm = None
 
diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py
index 55ab550f8..563f28f6b 100644
--- a/comfy/ldm/hunyuan_video/model.py
+++ b/comfy/ldm/hunyuan_video/model.py
@@ -241,7 +241,6 @@ class HunyuanVideo(nn.Module):
                     self.num_heads,
                     mlp_ratio=params.mlp_ratio,
                     qkv_bias=params.qkv_bias,
-                    flipped_img_txt=True,
                     dtype=dtype, device=device, operations=operations
                 )
                 for _ in range(params.depth)
@@ -378,14 +377,14 @@ class HunyuanVideo(nn.Module):
             extra_txt_ids = torch.zeros((txt_ids.shape[0], txt_vision_states.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
             txt_ids = torch.cat((txt_ids, extra_txt_ids), dim=1)
 
-        ids = torch.cat((img_ids, txt_ids), dim=1)
+        ids = torch.cat((txt_ids, img_ids), dim=1)
         pe = self.pe_embedder(ids)
 
         img_len = img.shape[1]
         if txt_mask is not None:
             attn_mask_len = img_len + txt.shape[1]
             attn_mask = torch.zeros((1, 1, attn_mask_len), dtype=img.dtype, device=img.device)
-            attn_mask[:, 0, img_len:] = txt_mask
+            attn_mask[:, 0, :txt.shape[1]] = txt_mask
         else:
             attn_mask = None
 
@@ -413,7 +412,7 @@ class HunyuanVideo(nn.Module):
                     if add is not None:
                         img += add
 
-        img = torch.cat((img, txt), 1)
+        img = torch.cat((txt, img), 1)
 
         transformer_options["total_blocks"] = len(self.single_blocks)
         transformer_options["block_type"] = "single"
@@ -435,9 +434,9 @@ class HunyuanVideo(nn.Module):
                 if i < len(control_o):
                     add = control_o[i]
                     if add is not None:
-                        img[:, : img_len] += add
+                        img[:, txt.shape[1]: img_len + txt.shape[1]] += add
 
-        img = img[:, : img_len]
+        img = img[:, txt.shape[1]: img_len + txt.shape[1]]
         if ref_latent is not None:
             img = img[:, ref_latent.shape[1]:]
 
diff --git a/comfy/lora_convert.py b/comfy/lora_convert.py
index 9d8d21efe..749e81df3 100644
--- a/comfy/lora_convert.py
+++ b/comfy/lora_convert.py
@@ -5,7 +5,7 @@ import comfy.utils
 def convert_lora_bfl_control(sd): #BFL loras for Flux
     sd_out = {}
     for k in sd:
-        k_to = "diffusion_model.{}".format(k.replace(".lora_B.bias", ".diff_b").replace("_norm.scale", "_norm.scale.set_weight"))
+        k_to = "diffusion_model.{}".format(k.replace(".lora_B.bias", ".diff_b").replace("_norm.scale", "_norm.set_weight"))
         sd_out[k_to] = sd[k]
 
     sd_out["diffusion_model.img_in.reshape_weight"] = torch.tensor([sd["img_in.lora_B.weight"].shape[0], sd["img_in.lora_A.weight"].shape[1]])
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index e8ad725df..30ea03e8e 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -19,6 +19,12 @@ def count_blocks(state_dict_keys, prefix_string):
         count += 1
     return count
 
+def any_suffix_in(keys, prefix, main, suffix_list=[]):
+    for x in suffix_list:
+        if "{}{}{}".format(prefix, main, x) in keys:
+            return True
+    return False
+
 def calculate_transformer_depth(prefix, state_dict_keys, state_dict):
     context_dim = None
     use_linear_in_transformer = False
@@ -186,7 +192,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
             dit_config["meanflow_sum"] = False
         return dit_config
 
-    if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or f"{key_prefix}distilled_guidance_layer.norms.0.scale" in state_dict_keys): #Flux, Chroma or Chroma Radiance (has no img_in.weight)
+    if any_suffix_in(state_dict_keys, key_prefix, 'double_blocks.0.img_attn.norm.key_norm.', ["weight", "scale"]) and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or any_suffix_in(state_dict_keys, key_prefix, 'distilled_guidance_layer.norms.0.', ["weight", "scale"])): #Flux, Chroma or Chroma Radiance (has no img_in.weight)
         dit_config = {}
         if '{}double_stream_modulation_img.lin.weight'.format(key_prefix) in state_dict_keys:
             dit_config["image_model"] = "flux2"
@@ -241,7 +247,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
 
         dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.')
         dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.')
-        if '{}distilled_guidance_layer.0.norms.0.scale'.format(key_prefix) in state_dict_keys or '{}distilled_guidance_layer.norms.0.scale'.format(key_prefix) in state_dict_keys: #Chroma
+
+        if any_suffix_in(state_dict_keys, key_prefix, 'distilled_guidance_layer.0.norms.0.', ["weight", "scale"]) or any_suffix_in(state_dict_keys, key_prefix, 'distilled_guidance_layer.norms.0.', ["weight", "scale"]): #Chroma
             dit_config["image_model"] = "chroma"
             dit_config["in_channels"] = 64
             dit_config["out_channels"] = 64
@@ -249,7 +256,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
             dit_config["out_dim"] = 3072
             dit_config["hidden_dim"] = 5120
             dit_config["n_layers"] = 5
-            if f"{key_prefix}nerf_blocks.0.norm.scale" in state_dict_keys: #Chroma Radiance
+
+            if any_suffix_in(state_dict_keys, key_prefix, 'nerf_blocks.0.norm.', ["weight", "scale"]): #Chroma Radiance
                 dit_config["image_model"] = "chroma_radiance"
                 dit_config["in_channels"] = 3
                 dit_config["out_channels"] = 3
@@ -259,7 +267,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
                 dit_config["nerf_depth"] = 4
                 dit_config["nerf_max_freqs"] = 8
                 dit_config["nerf_tile_size"] = 512
-                dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear"
+                dit_config["nerf_final_head_type"] = "conv" if any_suffix_in(state_dict_keys, key_prefix, 'nerf_final_layer_conv.norm.', ["weight", "scale"]) else "linear"
                 dit_config["nerf_embedder_dtype"] = torch.float32
                 if "{}__x0__".format(key_prefix) in state_dict_keys: # x0 pred
                     dit_config["use_x0"] = True
@@ -268,7 +276,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         else:
             dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
             dit_config["yak_mlp"] = '{}double_blocks.0.img_mlp.gate_proj.weight'.format(key_prefix) in state_dict_keys
-            dit_config["txt_norm"] = "{}txt_norm.scale".format(key_prefix) in state_dict_keys
+            dit_config["txt_norm"] = any_suffix_in(state_dict_keys, key_prefix, 'txt_norm.', ["weight", "scale"])
             if dit_config["yak_mlp"] and dit_config["txt_norm"]:  # Ovis model
                 dit_config["txt_ids_dims"] = [1, 2]
 
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index d33db7507..c28be1716 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -710,6 +710,15 @@ class Flux(supported_models_base.BASE):
 
     supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
 
+    def process_unet_state_dict(self, state_dict):
+        out_sd = {}
+        for k in list(state_dict.keys()):
+            key_out = k
+            if key_out.endswith("_norm.scale"):
+                key_out = "{}.weight".format(key_out[:-len(".scale")])
+            out_sd[key_out] = state_dict[k]
+        return out_sd
+
     vae_key_prefix = ["vae."]
     text_encoder_key_prefix = ["text_encoders."]
 
@@ -898,11 +907,13 @@ class HunyuanVideo(supported_models_base.BASE):
             key_out = key_out.replace("txt_in.c_embedder.linear_1.", "txt_in.c_embedder.in_layer.").replace("txt_in.c_embedder.linear_2.", "txt_in.c_embedder.out_layer.")
             key_out = key_out.replace("_mod.linear.", "_mod.lin.").replace("_attn_qkv.", "_attn.qkv.")
             key_out = key_out.replace("mlp.fc1.", "mlp.0.").replace("mlp.fc2.", "mlp.2.")
-            key_out = key_out.replace("_attn_q_norm.weight", "_attn.norm.query_norm.scale").replace("_attn_k_norm.weight", "_attn.norm.key_norm.scale")
-            key_out = key_out.replace(".q_norm.weight", ".norm.query_norm.scale").replace(".k_norm.weight", ".norm.key_norm.scale")
+            key_out = key_out.replace("_attn_q_norm.weight", "_attn.norm.query_norm.weight").replace("_attn_k_norm.weight", "_attn.norm.key_norm.weight")
+            key_out = key_out.replace(".q_norm.weight", ".norm.query_norm.weight").replace(".k_norm.weight", ".norm.key_norm.weight")
             key_out = key_out.replace("_attn_proj.", "_attn.proj.")
             key_out = key_out.replace(".modulation.linear.", ".modulation.lin.")
             key_out = key_out.replace("_in.mlp.2.", "_in.out_layer.").replace("_in.mlp.0.", "_in.in_layer.")
+            if key_out.endswith(".scale"):
+                key_out = "{}.weight".format(key_out[:-len(".scale")])
             out_sd[key_out] = state_dict[k]
         return out_sd
 
@@ -1264,6 +1275,15 @@ class Hunyuan3Dv2(supported_models_base.BASE):
 
     latent_format = latent_formats.Hunyuan3Dv2
 
+    def process_unet_state_dict(self, state_dict):
+        out_sd = {}
+        for k in list(state_dict.keys()):
+            key_out = k
+            if key_out.endswith(".scale"):
+                key_out = "{}.weight".format(key_out[:-len(".scale")])
+            out_sd[key_out] = state_dict[k]
+        return out_sd
+
     def process_unet_state_dict_for_saving(self, state_dict):
         replace_prefix = {"": "model."}
         return utils.state_dict_prefix_replace(state_dict, replace_prefix)
@@ -1341,6 +1361,14 @@ class Chroma(supported_models_base.BASE):
 
     supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
 
+    def process_unet_state_dict(self, state_dict):
+        out_sd = {}
+        for k in list(state_dict.keys()):
+            key_out = k
+            if key_out.endswith(".scale"):
+                key_out = "{}.weight".format(key_out[:-len(".scale")])
+            out_sd[key_out] = state_dict[k]
+        return out_sd
 
     def get_model(self, state_dict, prefix="", device=None):
         out = model_base.Chroma(self, device=device)
diff --git a/comfy/utils.py b/comfy/utils.py
index e0a94e2e1..d553a7c1b 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -675,10 +675,10 @@ def flux_to_diffusers(mmdit_config, output_prefix=""):
                         "ff_context.linear_in.bias": "txt_mlp.0.bias",
                         "ff_context.linear_out.weight": "txt_mlp.2.weight",
                         "ff_context.linear_out.bias": "txt_mlp.2.bias",
-                        "attn.norm_q.weight": "img_attn.norm.query_norm.scale",
-                        "attn.norm_k.weight": "img_attn.norm.key_norm.scale",
-                        "attn.norm_added_q.weight": "txt_attn.norm.query_norm.scale",
-                        "attn.norm_added_k.weight": "txt_attn.norm.key_norm.scale",
+                        "attn.norm_q.weight": "img_attn.norm.query_norm.weight",
+                        "attn.norm_k.weight": "img_attn.norm.key_norm.weight",
+                        "attn.norm_added_q.weight": "txt_attn.norm.query_norm.weight",
+                        "attn.norm_added_k.weight": "txt_attn.norm.key_norm.weight",
                     }
 
         for k in block_map:
@@ -701,8 +701,8 @@ def flux_to_diffusers(mmdit_config, output_prefix=""):
                         "norm.linear.bias": "modulation.lin.bias",
                         "proj_out.weight": "linear2.weight",
                         "proj_out.bias": "linear2.bias",
-                        "attn.norm_q.weight": "norm.query_norm.scale",
-                        "attn.norm_k.weight": "norm.key_norm.scale",
+                        "attn.norm_q.weight": "norm.query_norm.weight",
+                        "attn.norm_k.weight": "norm.key_norm.weight",
                         "attn.to_qkv_mlp_proj.weight": "linear1.weight", # Flux 2
                         "attn.to_out.weight": "linear2.weight", # Flux 2
                     }

From 831351a29e91ea758437227c2f3c915a6be6d1a6 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 13 Feb 2026 17:15:23 -0800
Subject: [PATCH 22/25] Support generating attention masks for left padded text
 encoders. (#12454)

---
 comfy/sd1_clip.py            | 15 +++++++++++----
 comfy/text_encoders/ace15.py |  8 +-------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py
index 4c817d468..b564d1529 100644
--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@@ -171,8 +171,9 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
 
     def process_tokens(self, tokens, device):
         end_token = self.special_tokens.get("end", None)
+        pad_token = self.special_tokens.get("pad", -1)
         if end_token is None:
-            cmp_token = self.special_tokens.get("pad", -1)
+            cmp_token = pad_token
         else:
             cmp_token = end_token
 
@@ -186,15 +187,21 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
             other_embeds = []
             eos = False
             index = 0
+            left_pad = False
             for y in x:
                 if isinstance(y, numbers.Integral):
-                    if eos:
+                    token = int(y)
+                    if index == 0 and token == pad_token:
+                        left_pad = True
+
+                    if eos or (left_pad and token == pad_token):
                         attention_mask.append(0)
                     else:
                         attention_mask.append(1)
-                    token = int(y)
+                        left_pad = False
+
                     tokens_temp += [token]
-                    if not eos and token == cmp_token:
+                    if not eos and token == cmp_token and not left_pad:
                         if end_token is None:
                             attention_mask[-1] = 0
                         eos = True
diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py
index 0fdd4669f..f135d74c1 100644
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@@ -10,7 +10,6 @@ import comfy.utils
 def sample_manual_loop_no_classes(
     model,
     ids=None,
-    paddings=[],
     execution_dtype=None,
     cfg_scale: float = 2.0,
     temperature: float = 0.85,
@@ -36,9 +35,6 @@ def sample_manual_loop_no_classes(
 
     embeds, attention_mask, num_tokens, embeds_info = model.process_tokens(ids, device)
     embeds_batch = embeds.shape[0]
-    for i, t in enumerate(paddings):
-        attention_mask[i, :t] = 0
-        attention_mask[i, t:] = 1
 
     output_audio_codes = []
     past_key_values = []
@@ -135,13 +131,11 @@ def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=102
             pos_pad = (len(negative) - len(positive))
             positive = [model.special_tokens["pad"]] * pos_pad + positive
 
-        paddings = [pos_pad, neg_pad]
         ids = [positive, negative]
     else:
-        paddings = []
         ids = [positive]
 
-    return sample_manual_loop_no_classes(model, ids, paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
+    return sample_manual_loop_no_classes(model, ids, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
 
 
 class ACE15Tokenizer(sd1_clip.SD1Tokenizer):

From 726af73867c18c5ca8b980a2c28401d77e5b365a Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 13 Feb 2026 17:21:10 -0800
Subject: [PATCH 23/25] Fix some custom nodes. (#12455)

---
 comfy/ldm/flux/layers.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py
index 1f2975fb1..3518a1922 100644
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@@ -6,6 +6,8 @@ from torch import Tensor, nn
 
 from .math import attention, rope
 
+# Fix import for some custom nodes, TODO: delete eventually.
+RMSNorm = None
 
 class EmbedND(nn.Module):
     def __init__(self, dim: int, theta: int, axes_dim: list):

From 712efb466b9379e6761802c44027783d37d96a87 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 13 Feb 2026 18:56:54 -0800
Subject: [PATCH 24/25] Add left padding to LTXAV text encoder. (#12456)

---
 comfy/text_encoders/lt.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/comfy/text_encoders/lt.py b/comfy/text_encoders/lt.py
index 3f87dfd6a..9cf87c0b2 100644
--- a/comfy/text_encoders/lt.py
+++ b/comfy/text_encoders/lt.py
@@ -25,7 +25,7 @@ def ltxv_te(*args, **kwargs):
 class Gemma3_12BTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer = tokenizer_data.get("spiece_model", None)
-        super().__init__(tokenizer, pad_with_end=False, embedding_size=3840, embedding_key='gemma3_12b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, disable_weights=True, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
+        super().__init__(tokenizer, pad_with_end=False, embedding_size=3840, embedding_key='gemma3_12b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, pad_left=True, disable_weights=True, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
 
     def state_dict(self):
         return {"spiece_model": self.tokenizer.serialize_model()}
@@ -97,6 +97,7 @@ class LTXAVTEModel(torch.nn.Module):
         token_weight_pairs = token_weight_pairs["gemma3_12b"]
 
         out, pooled, extra = self.gemma3_12b.encode_token_weights(token_weight_pairs)
+        out = out[:, :, -torch.sum(extra["attention_mask"]).item():]
         out_device = out.device
         if comfy.model_management.should_use_bf16(self.execution_device):
             out = out.to(device=self.execution_device, dtype=torch.bfloat16)
@@ -138,6 +139,7 @@ class LTXAVTEModel(torch.nn.Module):
 
         token_weight_pairs = token_weight_pairs.get("gemma3_12b", [])
         num_tokens = sum(map(lambda a: len(a), token_weight_pairs))
+        num_tokens = max(num_tokens, 64)
         return num_tokens * constant * 1024 * 1024
 
 def ltxav_te(dtype_llama=None, llama_quantization_metadata=None):

From dc9822b7df4785e77690e93b0e09feaff01e2e12 Mon Sep 17 00:00:00 2001
From: krigeta <75309361+krigeta@users.noreply.github.com>
Date: Sat, 14 Feb 2026 08:53:52 +0530
Subject: [PATCH 25/25] Add working Qwen 2512 ControlNet (Fun ControlNet)
 support (#12359)

---
 comfy/controlnet.py                |  73 +++++++++++
 comfy/ldm/qwen_image/controlnet.py | 190 +++++++++++++++++++++++++++++
 2 files changed, 263 insertions(+)

diff --git a/comfy/controlnet.py b/comfy/controlnet.py
index 8336412f2..ba670b16d 100644
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@@ -297,6 +297,30 @@ class ControlNet(ControlBase):
         self.model_sampling_current = None
         super().cleanup()
 
+
+class QwenFunControlNet(ControlNet):
+    def get_control(self, x_noisy, t, cond, batched_number, transformer_options):
+        # Fun checkpoints are more sensitive to high strengths in the generic
+        # ControlNet merge path. Use a soft response curve so strength=1.0 stays
+        # unchanged while >1 grows more gently.
+        original_strength = self.strength
+        self.strength = math.sqrt(max(self.strength, 0.0))
+        try:
+            return super().get_control(x_noisy, t, cond, batched_number, transformer_options)
+        finally:
+            self.strength = original_strength
+
+    def pre_run(self, model, percent_to_timestep_function):
+        super().pre_run(model, percent_to_timestep_function)
+        self.set_extra_arg("base_model", model.diffusion_model)
+
+    def copy(self):
+        c = QwenFunControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
+        c.control_model = self.control_model
+        c.control_model_wrapped = self.control_model_wrapped
+        self.copy_to(c)
+        return c
+
 class ControlLoraOps:
     class Linear(torch.nn.Module, comfy.ops.CastWeightBiasOp):
         def __init__(self, in_features: int, out_features: int, bias: bool = True,
@@ -606,6 +630,53 @@ def load_controlnet_qwen_instantx(sd, model_options={}):
     control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
     return control
 
+
+def load_controlnet_qwen_fun(sd, model_options={}):
+    load_device = comfy.model_management.get_torch_device()
+    weight_dtype = comfy.utils.weight_dtype(sd)
+    unet_dtype = model_options.get("dtype", weight_dtype)
+    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
+
+    operations = model_options.get("custom_operations", None)
+    if operations is None:
+        operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype, disable_fast_fp8=True)
+
+    in_features = sd["control_img_in.weight"].shape[1]
+    inner_dim = sd["control_img_in.weight"].shape[0]
+
+    block_weight = sd["control_blocks.0.attn.to_q.weight"]
+    attention_head_dim = sd["control_blocks.0.attn.norm_q.weight"].shape[0]
+    num_attention_heads = max(1, block_weight.shape[0] // max(1, attention_head_dim))
+
+    model = comfy.ldm.qwen_image.controlnet.QwenImageFunControlNetModel(
+        control_in_features=in_features,
+        inner_dim=inner_dim,
+        num_attention_heads=num_attention_heads,
+        attention_head_dim=attention_head_dim,
+        num_control_blocks=5,
+        main_model_double=60,
+        injection_layers=(0, 12, 24, 36, 48),
+        operations=operations,
+        device=comfy.model_management.unet_offload_device(),
+        dtype=unet_dtype,
+    )
+    model = controlnet_load_state_dict(model, sd)
+
+    latent_format = comfy.latent_formats.Wan21()
+    control = QwenFunControlNet(
+        model,
+        compression_ratio=1,
+        latent_format=latent_format,
+        # Fun checkpoints already expect their own 33-channel context handling.
+        # Enabling generic concat_mask injects an extra mask channel at apply-time
+        # and breaks the intended fallback packing path.
+        concat_mask=False,
+        load_device=load_device,
+        manual_cast_dtype=manual_cast_dtype,
+        extra_conds=[],
+    )
+    return control
+
 def convert_mistoline(sd):
     return comfy.utils.state_dict_prefix_replace(sd, {"single_controlnet_blocks.": "controlnet_single_blocks."})
 
@@ -683,6 +754,8 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
             return load_controlnet_qwen_instantx(controlnet_data, model_options=model_options)
         elif "controlnet_x_embedder.weight" in controlnet_data:
             return load_controlnet_flux_instantx(controlnet_data, model_options=model_options)
+    elif "control_blocks.0.after_proj.weight" in controlnet_data and "control_img_in.weight" in controlnet_data:
+        return load_controlnet_qwen_fun(controlnet_data, model_options=model_options)
 
     elif "controlnet_blocks.0.linear.weight" in controlnet_data: #mistoline flux
         return load_controlnet_flux_xlabs_mistoline(convert_mistoline(controlnet_data), mistoline=True, model_options=model_options)
diff --git a/comfy/ldm/qwen_image/controlnet.py b/comfy/ldm/qwen_image/controlnet.py
index a6d408104..c0aae9240 100644
--- a/comfy/ldm/qwen_image/controlnet.py
+++ b/comfy/ldm/qwen_image/controlnet.py
@@ -2,6 +2,196 @@ import torch
 import math
 
 from .model import QwenImageTransformer2DModel
+from .model import QwenImageTransformerBlock
+
+
+class QwenImageFunControlBlock(QwenImageTransformerBlock):
+    def __init__(self, dim, num_attention_heads, attention_head_dim, has_before_proj=False, dtype=None, device=None, operations=None):
+        super().__init__(
+            dim=dim,
+            num_attention_heads=num_attention_heads,
+            attention_head_dim=attention_head_dim,
+            dtype=dtype,
+            device=device,
+            operations=operations,
+        )
+        self.has_before_proj = has_before_proj
+        if has_before_proj:
+            self.before_proj = operations.Linear(dim, dim, device=device, dtype=dtype)
+        self.after_proj = operations.Linear(dim, dim, device=device, dtype=dtype)
+
+
+class QwenImageFunControlNetModel(torch.nn.Module):
+    def __init__(
+        self,
+        control_in_features=132,
+        inner_dim=3072,
+        num_attention_heads=24,
+        attention_head_dim=128,
+        num_control_blocks=5,
+        main_model_double=60,
+        injection_layers=(0, 12, 24, 36, 48),
+        dtype=None,
+        device=None,
+        operations=None,
+    ):
+        super().__init__()
+        self.dtype = dtype
+        self.main_model_double = main_model_double
+        self.injection_layers = tuple(injection_layers)
+        # Keep base hint scaling at 1.0 so user-facing strength behaves similarly
+        # to the reference Gen2/VideoX implementation around strength=1.
+        self.hint_scale = 1.0
+        self.control_img_in = operations.Linear(control_in_features, inner_dim, device=device, dtype=dtype)
+
+        self.control_blocks = torch.nn.ModuleList([])
+        for i in range(num_control_blocks):
+            self.control_blocks.append(
+                QwenImageFunControlBlock(
+                    dim=inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                    has_before_proj=(i == 0),
+                    dtype=dtype,
+                    device=device,
+                    operations=operations,
+                )
+            )
+
+    def _process_hint_tokens(self, hint):
+        if hint is None:
+            return None
+        if hint.ndim == 4:
+            hint = hint.unsqueeze(2)
+
+        # Fun checkpoints are trained with 33 latent channels before 2x2 packing:
+        # [control_latent(16), mask(1), inpaint_latent(16)] -> 132 features.
+        # Default behavior (no inpaint input in stock Apply ControlNet) should use
+        # zeros for mask/inpaint branches, matching VideoX fallback semantics.
+        expected_c = self.control_img_in.weight.shape[1] // 4
+        if hint.shape[1] == 16 and expected_c == 33:
+            zeros_mask = torch.zeros_like(hint[:, :1])
+            zeros_inpaint = torch.zeros_like(hint)
+            hint = torch.cat([hint, zeros_mask, zeros_inpaint], dim=1)
+
+        bs, c, t, h, w = hint.shape
+        hidden_states = torch.nn.functional.pad(hint, (0, w % 2, 0, h % 2))
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(
+            orig_shape[0],
+            orig_shape[1],
+            orig_shape[-3],
+            orig_shape[-2] // 2,
+            2,
+            orig_shape[-1] // 2,
+            2,
+        )
+        hidden_states = hidden_states.permute(0, 2, 3, 5, 1, 4, 6)
+        hidden_states = hidden_states.reshape(
+            bs,
+            t * ((h + 1) // 2) * ((w + 1) // 2),
+            c * 4,
+        )
+
+        expected_in = self.control_img_in.weight.shape[1]
+        cur_in = hidden_states.shape[-1]
+        if cur_in < expected_in:
+            pad = torch.zeros(
+                (hidden_states.shape[0], hidden_states.shape[1], expected_in - cur_in),
+                device=hidden_states.device,
+                dtype=hidden_states.dtype,
+            )
+            hidden_states = torch.cat([hidden_states, pad], dim=-1)
+        elif cur_in > expected_in:
+            hidden_states = hidden_states[:, :, :expected_in]
+
+        return hidden_states
+
+    def forward(
+        self,
+        x,
+        timesteps,
+        context,
+        attention_mask=None,
+        guidance: torch.Tensor = None,
+        hint=None,
+        transformer_options={},
+        base_model=None,
+        **kwargs,
+    ):
+        if base_model is None:
+            raise RuntimeError("Qwen Fun ControlNet requires a QwenImage base model at runtime.")
+
+        encoder_hidden_states_mask = attention_mask
+        # Keep attention mask disabled inside Fun control blocks to mirror
+        # VideoX behavior (they rely on seq lengths for RoPE, not masked attention).
+        encoder_hidden_states_mask = None
+
+        hidden_states, img_ids, _ = base_model.process_img(x)
+        hint_tokens = self._process_hint_tokens(hint)
+        if hint_tokens is None:
+            raise RuntimeError("Qwen Fun ControlNet requires a control hint image.")
+
+        if hint_tokens.shape[1] != hidden_states.shape[1]:
+            max_tokens = min(hint_tokens.shape[1], hidden_states.shape[1])
+            hint_tokens = hint_tokens[:, :max_tokens]
+            hidden_states = hidden_states[:, :max_tokens]
+            img_ids = img_ids[:, :max_tokens]
+
+        txt_start = round(
+            max(
+                ((x.shape[-1] + (base_model.patch_size // 2)) // base_model.patch_size) // 2,
+                ((x.shape[-2] + (base_model.patch_size // 2)) // base_model.patch_size) // 2,
+            )
+        )
+        txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3)
+        ids = torch.cat((txt_ids, img_ids), dim=1)
+        image_rotary_emb = base_model.pe_embedder(ids).to(x.dtype).contiguous()
+
+        hidden_states = base_model.img_in(hidden_states)
+        encoder_hidden_states = base_model.txt_norm(context)
+        encoder_hidden_states = base_model.txt_in(encoder_hidden_states)
+
+        if guidance is not None:
+            guidance = guidance * 1000
+
+        temb = (
+            base_model.time_text_embed(timesteps, hidden_states)
+            if guidance is None
+            else base_model.time_text_embed(timesteps, guidance, hidden_states)
+        )
+
+        c = self.control_img_in(hint_tokens)
+
+        for i, block in enumerate(self.control_blocks):
+            if i == 0:
+                c_in = block.before_proj(c) + hidden_states
+                all_c = []
+            else:
+                all_c = list(torch.unbind(c, dim=0))
+                c_in = all_c.pop(-1)
+
+            encoder_hidden_states, c_out = block(
+                hidden_states=c_in,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_hidden_states_mask=encoder_hidden_states_mask,
+                temb=temb,
+                image_rotary_emb=image_rotary_emb,
+                transformer_options=transformer_options,
+            )
+
+            c_skip = block.after_proj(c_out) * self.hint_scale
+            all_c += [c_skip, c_out]
+            c = torch.stack(all_c, dim=0)
+
+        hints = torch.unbind(c, dim=0)[:-1]
+
+        controlnet_block_samples = [None] * self.main_model_double
+        for local_idx, base_idx in enumerate(self.injection_layers):
+            if local_idx < len(hints) and base_idx < len(controlnet_block_samples):
+                controlnet_block_samples[base_idx] = hints[local_idx]
+
+        return {"input": controlnet_block_samples}
 
 
 class QwenImageControlNetModel(QwenImageTransformer2DModel):