From fe053ba5eb34c8abcc5d17a25c114340af1833aa Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Tue, 10 Feb 2026 10:37:17 -0800 Subject: [PATCH 01/25] mp: dont deep-clone objects from model_options (#12382) If there are non-trivial python objects nested in the model_options, this causes all sorts of issues. Traverse lists and dicts so clones can safely overide settings and BYO objects but stop there on the deepclone. --- comfy/model_patcher.py | 3 +-- comfy/utils.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index b9a117a7c..19c9031ea 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -19,7 +19,6 @@ from __future__ import annotations import collections -import copy import inspect import logging import math @@ -317,7 +316,7 @@ class ModelPatcher: n.object_patches = self.object_patches.copy() n.weight_wrapper_patches = self.weight_wrapper_patches.copy() - n.model_options = copy.deepcopy(self.model_options) + n.model_options = comfy.utils.deepcopy_list_dict(self.model_options) n.backup = self.backup n.object_patches_backup = self.object_patches_backup n.parent = self diff --git a/comfy/utils.py b/comfy/utils.py index 1337e2205..edd80cebe 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -1376,3 +1376,21 @@ def string_to_seed(data): else: crc >>= 1 return crc ^ 0xFFFFFFFF + +def deepcopy_list_dict(obj, memo=None): + if memo is None: + memo = {} + + obj_id = id(obj) + if obj_id in memo: + return memo[obj_id] + + if isinstance(obj, dict): + res = {deepcopy_list_dict(k, memo): deepcopy_list_dict(v, memo) for k, v in obj.items()} + elif isinstance(obj, list): + res = [deepcopy_list_dict(i, memo) for i in obj] + else: + res = obj + + memo[obj_id] = res + return res From f719f9c06266e7944683009b403e995d4c61d5f0 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Tue, 10 Feb 2026 10:37:46 -0800 Subject: [PATCH 02/25] sd: delay VAE dtype archive until after override (#12388) VAEs have host specific dtype logic that should override the dynamic _model_dtype. Defer the archiving of model dtypes until after. --- comfy/sd.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index bc9407405..f65e7cadd 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -793,8 +793,6 @@ class VAE: self.first_stage_model = AutoencoderKL(**(config['params'])) self.first_stage_model = self.first_stage_model.eval() - model_management.archive_model_dtypes(self.first_stage_model) - if device is None: device = model_management.vae_device() self.device = device @@ -803,6 +801,7 @@ class VAE: dtype = model_management.vae_dtype(self.device, self.working_dtypes) self.vae_dtype = dtype self.first_stage_model.to(self.vae_dtype) + model_management.archive_model_dtypes(self.first_stage_model) self.output_device = model_management.intermediate_device() mp = comfy.model_patcher.CoreModelPatcher From 123a7874a97c4a8b8f06d4b7c2b1a566b8f0d057 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Tue, 10 Feb 2026 10:38:28 -0800 Subject: [PATCH 03/25] ops: Fix vanilla-fp8 loaded lora quality (#12390) This was missing the stochastic rounding required for fp8 downcast to be consistent with model_patcher.patch_weight_to_device. Missed in testing as I spend too much time with quantized tensors and overlooked the simpler ones. --- comfy/ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index ea0d70702..33803b223 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -169,8 +169,8 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu if orig.dtype == dtype and len(fns) == 0: #The layer actually wants our freshly saved QT x = y - else: - y = x + elif update_weight: + y = comfy.float.stochastic_rounding(x, orig.dtype, seed = comfy.utils.string_to_seed(s.seed_key)) if update_weight: orig.copy_(y) for f in fns: From 00fff6019ecf0f4306005579e93cef0cd51a3a1c Mon Sep 17 00:00:00 2001 From: guill Date: Tue, 10 Feb 2026 14:37:14 -0800 Subject: [PATCH 04/25] feat(jobs): add 3d to PREVIEWABLE_MEDIA_TYPES for first-class 3D output support (#12381) Co-authored-by: Jedrzej Kosinski --- comfy_execution/jobs.py | 79 +++++++++++-- tests/execution/test_jobs.py | 208 ++++++++++++++++++++++++++++++++++- 2 files changed, 271 insertions(+), 16 deletions(-) diff --git a/comfy_execution/jobs.py b/comfy_execution/jobs.py index bf091a448..370014fb6 100644 --- a/comfy_execution/jobs.py +++ b/comfy_execution/jobs.py @@ -20,10 +20,60 @@ class JobStatus: # Media types that can be previewed in the frontend -PREVIEWABLE_MEDIA_TYPES = frozenset({'images', 'video', 'audio'}) +PREVIEWABLE_MEDIA_TYPES = frozenset({'images', 'video', 'audio', '3d'}) # 3D file extensions for preview fallback (no dedicated media_type exists) -THREE_D_EXTENSIONS = frozenset({'.obj', '.fbx', '.gltf', '.glb'}) +THREE_D_EXTENSIONS = frozenset({'.obj', '.fbx', '.gltf', '.glb', '.usdz'}) + + +def has_3d_extension(filename: str) -> bool: + lower = filename.lower() + return any(lower.endswith(ext) for ext in THREE_D_EXTENSIONS) + + +def normalize_output_item(item): + """Normalize a single output list item for the jobs API. + + Returns the normalized item, or None to exclude it. + String items with 3D extensions become {filename, type, subfolder} dicts. + """ + if item is None: + return None + if isinstance(item, str): + if has_3d_extension(item): + return {'filename': item, 'type': 'output', 'subfolder': '', 'mediaType': '3d'} + return None + if isinstance(item, dict): + return item + return None + + +def normalize_outputs(outputs: dict) -> dict: + """Normalize raw node outputs for the jobs API. + + Transforms string 3D filenames into file output dicts and removes + None items. All other items (non-3D strings, dicts, etc.) are + preserved as-is. + """ + normalized = {} + for node_id, node_outputs in outputs.items(): + if not isinstance(node_outputs, dict): + normalized[node_id] = node_outputs + continue + normalized_node = {} + for media_type, items in node_outputs.items(): + if media_type == 'animated' or not isinstance(items, list): + normalized_node[media_type] = items + continue + normalized_items = [] + for item in items: + if item is None: + continue + norm = normalize_output_item(item) + normalized_items.append(norm if norm is not None else item) + normalized_node[media_type] = normalized_items + normalized[node_id] = normalized_node + return normalized def _extract_job_metadata(extra_data: dict) -> tuple[Optional[int], Optional[str]]: @@ -45,9 +95,9 @@ def is_previewable(media_type: str, item: dict) -> bool: Maintains backwards compatibility with existing logic. Priority: - 1. media_type is 'images', 'video', or 'audio' + 1. media_type is 'images', 'video', 'audio', or '3d' 2. format field starts with 'video/' or 'audio/' - 3. filename has a 3D extension (.obj, .fbx, .gltf, .glb) + 3. filename has a 3D extension (.obj, .fbx, .gltf, .glb, .usdz) """ if media_type in PREVIEWABLE_MEDIA_TYPES: return True @@ -139,7 +189,7 @@ def normalize_history_item(prompt_id: str, history_item: dict, include_outputs: }) if include_outputs: - job['outputs'] = outputs + job['outputs'] = normalize_outputs(outputs) job['execution_status'] = status_info job['workflow'] = { 'prompt': prompt, @@ -171,18 +221,23 @@ def get_outputs_summary(outputs: dict) -> tuple[int, Optional[dict]]: continue for item in items: - count += 1 - - if not isinstance(item, dict): + normalized = normalize_output_item(item) + if normalized is None: continue - if preview_output is None and is_previewable(media_type, item): + count += 1 + + if preview_output is not None: + continue + + if isinstance(normalized, dict) and is_previewable(media_type, normalized): enriched = { - **item, + **normalized, 'nodeId': node_id, - 'mediaType': media_type } - if item.get('type') == 'output': + if 'mediaType' not in normalized: + enriched['mediaType'] = media_type + if normalized.get('type') == 'output': preview_output = enriched elif fallback_preview is None: fallback_preview = enriched diff --git a/tests/execution/test_jobs.py b/tests/execution/test_jobs.py index 4d2f9ed36..83c36fe48 100644 --- a/tests/execution/test_jobs.py +++ b/tests/execution/test_jobs.py @@ -5,8 +5,11 @@ from comfy_execution.jobs import ( is_previewable, normalize_queue_item, normalize_history_item, + normalize_output_item, + normalize_outputs, get_outputs_summary, apply_sorting, + has_3d_extension, ) @@ -35,8 +38,8 @@ class TestIsPreviewable: """Unit tests for is_previewable()""" def test_previewable_media_types(self): - """Images, video, audio media types should be previewable.""" - for media_type in ['images', 'video', 'audio']: + """Images, video, audio, 3d media types should be previewable.""" + for media_type in ['images', 'video', 'audio', '3d']: assert is_previewable(media_type, {}) is True def test_non_previewable_media_types(self): @@ -46,7 +49,7 @@ class TestIsPreviewable: def test_3d_extensions_previewable(self): """3D file extensions should be previewable regardless of media_type.""" - for ext in ['.obj', '.fbx', '.gltf', '.glb']: + for ext in ['.obj', '.fbx', '.gltf', '.glb', '.usdz']: item = {'filename': f'model{ext}'} assert is_previewable('files', item) is True @@ -160,7 +163,7 @@ class TestGetOutputsSummary: def test_3d_files_previewable(self): """3D file extensions should be previewable.""" - for ext in ['.obj', '.fbx', '.gltf', '.glb']: + for ext in ['.obj', '.fbx', '.gltf', '.glb', '.usdz']: outputs = { 'node1': { 'files': [{'filename': f'model{ext}', 'type': 'output'}] @@ -192,6 +195,64 @@ class TestGetOutputsSummary: assert preview['mediaType'] == 'images' assert preview['subfolder'] == 'outputs' + def test_string_3d_filename_creates_preview(self): + """String items with 3D extensions should synthesize a preview (Preview3D node output). + Only the .glb counts — nulls and non-file strings are excluded.""" + outputs = { + 'node1': { + 'result': ['preview3d_abc123.glb', None, None] + } + } + count, preview = get_outputs_summary(outputs) + assert count == 1 + assert preview is not None + assert preview['filename'] == 'preview3d_abc123.glb' + assert preview['mediaType'] == '3d' + assert preview['nodeId'] == 'node1' + assert preview['type'] == 'output' + + def test_string_non_3d_filename_no_preview(self): + """String items without 3D extensions should not create a preview.""" + outputs = { + 'node1': { + 'result': ['data.json', None] + } + } + count, preview = get_outputs_summary(outputs) + assert count == 0 + assert preview is None + + def test_string_3d_filename_used_as_fallback(self): + """String 3D preview should be used when no dict items are previewable.""" + outputs = { + 'node1': { + 'latents': [{'filename': 'latent.safetensors'}], + }, + 'node2': { + 'result': ['model.glb', None] + } + } + count, preview = get_outputs_summary(outputs) + assert preview is not None + assert preview['filename'] == 'model.glb' + assert preview['mediaType'] == '3d' + + +class TestHas3DExtension: + """Unit tests for has_3d_extension()""" + + def test_recognized_extensions(self): + for ext in ['.obj', '.fbx', '.gltf', '.glb', '.usdz']: + assert has_3d_extension(f'model{ext}') is True + + def test_case_insensitive(self): + assert has_3d_extension('MODEL.GLB') is True + assert has_3d_extension('Scene.GLTF') is True + + def test_non_3d_extensions(self): + for name in ['photo.png', 'video.mp4', 'data.json', 'model']: + assert has_3d_extension(name) is False + class TestApplySorting: """Unit tests for apply_sorting()""" @@ -395,3 +456,142 @@ class TestNormalizeHistoryItem: 'prompt': {'nodes': {'1': {}}}, 'extra_data': {'create_time': 1234567890, 'client_id': 'abc'}, } + + def test_include_outputs_normalizes_3d_strings(self): + """Detail view should transform string 3D filenames into file output dicts.""" + history_item = { + 'prompt': ( + 5, + 'prompt-3d', + {'nodes': {}}, + {'create_time': 1234567890}, + ['node1'], + ), + 'status': {'status_str': 'success', 'completed': True, 'messages': []}, + 'outputs': { + 'node1': { + 'result': ['preview3d_abc123.glb', None, None] + } + }, + } + job = normalize_history_item('prompt-3d', history_item, include_outputs=True) + + assert job['outputs_count'] == 1 + result_items = job['outputs']['node1']['result'] + assert len(result_items) == 1 + assert result_items[0] == { + 'filename': 'preview3d_abc123.glb', + 'type': 'output', + 'subfolder': '', + 'mediaType': '3d', + } + + def test_include_outputs_preserves_dict_items(self): + """Detail view normalization should pass dict items through unchanged.""" + history_item = { + 'prompt': ( + 5, + 'prompt-img', + {'nodes': {}}, + {'create_time': 1234567890}, + ['node1'], + ), + 'status': {'status_str': 'success', 'completed': True, 'messages': []}, + 'outputs': { + 'node1': { + 'images': [ + {'filename': 'photo.png', 'type': 'output', 'subfolder': ''}, + ] + } + }, + } + job = normalize_history_item('prompt-img', history_item, include_outputs=True) + + assert job['outputs_count'] == 1 + assert job['outputs']['node1']['images'] == [ + {'filename': 'photo.png', 'type': 'output', 'subfolder': ''}, + ] + + +class TestNormalizeOutputItem: + """Unit tests for normalize_output_item()""" + + def test_none_returns_none(self): + assert normalize_output_item(None) is None + + def test_string_3d_extension_synthesizes_dict(self): + result = normalize_output_item('model.glb') + assert result == {'filename': 'model.glb', 'type': 'output', 'subfolder': '', 'mediaType': '3d'} + + def test_string_non_3d_extension_returns_none(self): + assert normalize_output_item('data.json') is None + + def test_string_no_extension_returns_none(self): + assert normalize_output_item('camera_info_string') is None + + def test_dict_passes_through(self): + item = {'filename': 'test.png', 'type': 'output'} + assert normalize_output_item(item) is item + + def test_other_types_return_none(self): + assert normalize_output_item(42) is None + assert normalize_output_item(True) is None + + +class TestNormalizeOutputs: + """Unit tests for normalize_outputs()""" + + def test_empty_outputs(self): + assert normalize_outputs({}) == {} + + def test_dict_items_pass_through(self): + outputs = { + 'node1': { + 'images': [{'filename': 'a.png', 'type': 'output'}], + } + } + result = normalize_outputs(outputs) + assert result == outputs + + def test_3d_string_synthesized(self): + outputs = { + 'node1': { + 'result': ['model.glb', None, None], + } + } + result = normalize_outputs(outputs) + assert result == { + 'node1': { + 'result': [ + {'filename': 'model.glb', 'type': 'output', 'subfolder': '', 'mediaType': '3d'}, + ], + } + } + + def test_animated_key_preserved(self): + outputs = { + 'node1': { + 'images': [{'filename': 'a.png', 'type': 'output'}], + 'animated': [True], + } + } + result = normalize_outputs(outputs) + assert result['node1']['animated'] == [True] + + def test_non_dict_node_outputs_preserved(self): + outputs = {'node1': 'unexpected_value'} + result = normalize_outputs(outputs) + assert result == {'node1': 'unexpected_value'} + + def test_none_items_filtered_but_other_types_preserved(self): + outputs = { + 'node1': { + 'result': ['data.json', None, [1, 2, 3]], + } + } + result = normalize_outputs(outputs) + assert result == { + 'node1': { + 'result': ['data.json', [1, 2, 3]], + } + } From dbe70b6821994ce92d9cf211cc685862d0b6c0ca Mon Sep 17 00:00:00 2001 From: AustinMroz Date: Tue, 10 Feb 2026 14:42:21 -0800 Subject: [PATCH 05/25] Add a VideoSlice node (#12107) * Base TrimVideo implementation * Raise error if as_trimmed call fails * Bigger max start_time, tooltips, and formatting * Count packets unless codec has subframes * Remove incorrect nested decode * Add null check for audio streams * Support non-strict duration * Added strict_duration bool to node definition * Empty commit for approval * Fix duration * Support 5.1 audio layout on save --------- Co-authored-by: Jedrzej Kosinski --- comfy_api/latest/_input/video_types.py | 15 ++ comfy_api/latest/_input_impl/video_types.py | 201 ++++++++++++++------ comfy_extras/nodes_video.py | 51 +++++ 3 files changed, 207 insertions(+), 60 deletions(-) diff --git a/comfy_api/latest/_input/video_types.py b/comfy_api/latest/_input/video_types.py index e634a0311..451e9526e 100644 --- a/comfy_api/latest/_input/video_types.py +++ b/comfy_api/latest/_input/video_types.py @@ -34,6 +34,21 @@ class VideoInput(ABC): """ pass + @abstractmethod + def as_trimmed( + self, + start_time: float | None = None, + duration: float | None = None, + strict_duration: bool = False, + ) -> VideoInput | None: + """ + Create a new VideoInput which is trimmed to have the corresponding start_time and duration + + Returns: + A new VideoInput, or None if the result would have negative duration + """ + pass + def get_stream_source(self) -> Union[str, io.BytesIO]: """ Get a streamable source for the video. This allows processing without diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py index 1405d0b81..3463ed1c9 100644 --- a/comfy_api/latest/_input_impl/video_types.py +++ b/comfy_api/latest/_input_impl/video_types.py @@ -6,6 +6,7 @@ from typing import Optional from .._input import AudioInput, VideoInput import av import io +import itertools import json import numpy as np import math @@ -29,7 +30,6 @@ def container_to_output_format(container_format: str | None) -> str | None: formats = container_format.split(",") return formats[0] - def get_open_write_kwargs( dest: str | io.BytesIO, container_format: str, to_format: str | None ) -> dict: @@ -57,12 +57,14 @@ class VideoFromFile(VideoInput): Class representing video input from a file. """ - def __init__(self, file: str | io.BytesIO): + def __init__(self, file: str | io.BytesIO, *, start_time: float=0, duration: float=0): """ Initialize the VideoFromFile object based off of either a path on disk or a BytesIO object containing the file contents. """ self.__file = file + self.__start_time = start_time + self.__duration = duration def get_stream_source(self) -> str | io.BytesIO: """ @@ -96,6 +98,16 @@ class VideoFromFile(VideoInput): Returns: Duration in seconds """ + raw_duration = self._get_raw_duration() + if self.__start_time < 0: + duration_from_start = min(raw_duration, -self.__start_time) + else: + duration_from_start = raw_duration - self.__start_time + if self.__duration: + return min(self.__duration, duration_from_start) + return duration_from_start + + def _get_raw_duration(self) -> float: if isinstance(self.__file, io.BytesIO): self.__file.seek(0) with av.open(self.__file, mode="r") as container: @@ -113,9 +125,13 @@ class VideoFromFile(VideoInput): if video_stream and video_stream.average_rate: frame_count = 0 container.seek(0) - for packet in container.demux(video_stream): - for _ in packet.decode(): - frame_count += 1 + frame_iterator = ( + container.decode(video_stream) + if video_stream.codec.capabilities & 0x100 + else container.demux(video_stream) + ) + for packet in frame_iterator: + frame_count += 1 if frame_count > 0: return float(frame_count / video_stream.average_rate) @@ -131,36 +147,54 @@ class VideoFromFile(VideoInput): with av.open(self.__file, mode="r") as container: video_stream = self._get_first_video_stream(container) - # 1. Prefer the frames field if available - if video_stream.frames and video_stream.frames > 0: + # 1. Prefer the frames field if available and usable + if ( + video_stream.frames + and video_stream.frames > 0 + and not self.__start_time + and not self.__duration + ): return int(video_stream.frames) # 2. Try to estimate from duration and average_rate using only metadata - if container.duration is not None and video_stream.average_rate: - duration_seconds = float(container.duration / av.time_base) - estimated_frames = int(round(duration_seconds * float(video_stream.average_rate))) - if estimated_frames > 0: - return estimated_frames - if ( getattr(video_stream, "duration", None) is not None and getattr(video_stream, "time_base", None) is not None and video_stream.average_rate ): - duration_seconds = float(video_stream.duration * video_stream.time_base) + raw_duration = float(video_stream.duration * video_stream.time_base) + if self.__start_time < 0: + duration_from_start = min(raw_duration, -self.__start_time) + else: + duration_from_start = raw_duration - self.__start_time + duration_seconds = min(self.__duration, duration_from_start) estimated_frames = int(round(duration_seconds * float(video_stream.average_rate))) if estimated_frames > 0: return estimated_frames # 3. Last resort: decode frames and count them (streaming) - frame_count = 0 - container.seek(0) - for packet in container.demux(video_stream): - for _ in packet.decode(): - frame_count += 1 - - if frame_count == 0: - raise ValueError(f"Could not determine frame count for file '{self.__file}'") + if self.__start_time < 0: + start_time = max(self._get_raw_duration() + self.__start_time, 0) + else: + start_time = self.__start_time + frame_count = 1 + start_pts = int(start_time / video_stream.time_base) + end_pts = int((start_time + self.__duration) / video_stream.time_base) + container.seek(start_pts, stream=video_stream) + frame_iterator = ( + container.decode(video_stream) + if video_stream.codec.capabilities & 0x100 + else container.demux(video_stream) + ) + for frame in frame_iterator: + if frame.pts >= start_pts: + break + else: + raise ValueError(f"Could not determine frame count for file '{self.__file}'\nNo frames exist for start_time {self.__start_time}") + for frame in frame_iterator: + if frame.pts >= end_pts: + break + frame_count += 1 return frame_count def get_frame_rate(self) -> Fraction: @@ -199,9 +233,21 @@ class VideoFromFile(VideoInput): return container.format.name def get_components_internal(self, container: InputContainer) -> VideoComponents: + video_stream = self._get_first_video_stream(container) + if self.__start_time < 0: + start_time = max(self._get_raw_duration() + self.__start_time, 0) + else: + start_time = self.__start_time # Get video frames frames = [] - for frame in container.decode(video=0): + start_pts = int(start_time / video_stream.time_base) + end_pts = int((start_time + self.__duration) / video_stream.time_base) + container.seek(start_pts, stream=video_stream) + for frame in container.decode(video_stream): + if frame.pts < start_pts: + continue + if self.__duration and frame.pts >= end_pts: + break img = frame.to_ndarray(format='rgb24') # shape: (H, W, 3) img = torch.from_numpy(img) / 255.0 # shape: (H, W, 3) frames.append(img) @@ -209,31 +255,44 @@ class VideoFromFile(VideoInput): images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0) # Get frame rate - video_stream = next(s for s in container.streams if s.type == 'video') - frame_rate = Fraction(video_stream.average_rate) if video_stream and video_stream.average_rate else Fraction(1) + frame_rate = Fraction(video_stream.average_rate) if video_stream.average_rate else Fraction(1) # Get audio if available audio = None - try: - container.seek(0) # Reset the container to the beginning - for stream in container.streams: - if stream.type != 'audio': - continue - assert isinstance(stream, av.AudioStream) - audio_frames = [] - for packet in container.demux(stream): - for frame in packet.decode(): - assert isinstance(frame, av.AudioFrame) - audio_frames.append(frame.to_ndarray()) # shape: (channels, samples) - if len(audio_frames) > 0: - audio_data = np.concatenate(audio_frames, axis=1) # shape: (channels, total_samples) - audio_tensor = torch.from_numpy(audio_data).unsqueeze(0) # shape: (1, channels, total_samples) - audio = AudioInput({ - "waveform": audio_tensor, - "sample_rate": int(stream.sample_rate) if stream.sample_rate else 1, - }) - except StopIteration: - pass # No audio stream + container.seek(start_pts, stream=video_stream) + # Use last stream for consistency + if len(container.streams.audio): + audio_stream = container.streams.audio[-1] + audio_frames = [] + resample = av.audio.resampler.AudioResampler(format='fltp').resample + frames = itertools.chain.from_iterable( + map(resample, container.decode(audio_stream)) + ) + + has_first_frame = False + for frame in frames: + offset_seconds = start_time - frame.pts * audio_stream.time_base + to_skip = int(offset_seconds * audio_stream.sample_rate) + if to_skip < frame.samples: + has_first_frame = True + break + if has_first_frame: + audio_frames.append(frame.to_ndarray()[..., to_skip:]) + + for frame in frames: + if frame.time > start_time + self.__duration: + break + audio_frames.append(frame.to_ndarray()) # shape: (channels, samples) + if len(audio_frames) > 0: + audio_data = np.concatenate(audio_frames, axis=1) # shape: (channels, total_samples) + if self.__duration: + audio_data = audio_data[..., :int(self.__duration * audio_stream.sample_rate)] + + audio_tensor = torch.from_numpy(audio_data).unsqueeze(0) # shape: (1, channels, total_samples) + audio = AudioInput({ + "waveform": audio_tensor, + "sample_rate": int(audio_stream.sample_rate) if audio_stream.sample_rate else 1, + }) metadata = container.metadata return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata) @@ -250,7 +309,7 @@ class VideoFromFile(VideoInput): path: str | io.BytesIO, format: VideoContainer = VideoContainer.AUTO, codec: VideoCodec = VideoCodec.AUTO, - metadata: Optional[dict] = None + metadata: Optional[dict] = None, ): if isinstance(self.__file, io.BytesIO): self.__file.seek(0) # Reset the BytesIO object to the beginning @@ -262,15 +321,14 @@ class VideoFromFile(VideoInput): reuse_streams = False if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None: reuse_streams = False + if self.__start_time or self.__duration: + reuse_streams = False if not reuse_streams: components = self.get_components_internal(container) video = VideoFromComponents(components) return video.save_to( - path, - format=format, - codec=codec, - metadata=metadata + path, format=format, codec=codec, metadata=metadata ) streams = container.streams @@ -304,10 +362,21 @@ class VideoFromFile(VideoInput): output_container.mux(packet) def _get_first_video_stream(self, container: InputContainer): - video_stream = next((s for s in container.streams if s.type == "video"), None) - if video_stream is None: - raise ValueError(f"No video stream found in file '{self.__file}'") - return video_stream + if len(container.streams.video): + return container.streams.video[0] + raise ValueError(f"No video stream found in file '{self.__file}'") + + def as_trimmed( + self, start_time: float = 0, duration: float = 0, strict_duration: bool = True + ) -> VideoInput | None: + trimmed = VideoFromFile( + self.get_stream_source(), + start_time=start_time + self.__start_time, + duration=duration, + ) + if trimmed.get_duration() < duration and strict_duration: + return None + return trimmed class VideoFromComponents(VideoInput): @@ -322,7 +391,7 @@ class VideoFromComponents(VideoInput): return VideoComponents( images=self.__components.images, audio=self.__components.audio, - frame_rate=self.__components.frame_rate + frame_rate=self.__components.frame_rate, ) def save_to( @@ -330,7 +399,7 @@ class VideoFromComponents(VideoInput): path: str, format: VideoContainer = VideoContainer.AUTO, codec: VideoCodec = VideoCodec.AUTO, - metadata: Optional[dict] = None + metadata: Optional[dict] = None, ): if format != VideoContainer.AUTO and format != VideoContainer.MP4: raise ValueError("Only MP4 format is supported for now") @@ -357,7 +426,10 @@ class VideoFromComponents(VideoInput): audio_stream: Optional[av.AudioStream] = None if self.__components.audio: audio_sample_rate = int(self.__components.audio['sample_rate']) - audio_stream = output.add_stream('aac', rate=audio_sample_rate) + waveform = self.__components.audio['waveform'] + waveform = waveform[0, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])] + layout = {1: 'mono', 2: 'stereo', 6: '5.1'}.get(waveform.shape[0], 'stereo') + audio_stream = output.add_stream('aac', rate=audio_sample_rate, layout=layout) # Encode video for i, frame in enumerate(self.__components.images): @@ -372,12 +444,21 @@ class VideoFromComponents(VideoInput): output.mux(packet) if audio_stream and self.__components.audio: - waveform = self.__components.audio['waveform'] - waveform = waveform[:, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])] - frame = av.AudioFrame.from_ndarray(waveform.movedim(2, 1).reshape(1, -1).float().cpu().numpy(), format='flt', layout='mono' if waveform.shape[1] == 1 else 'stereo') + frame = av.AudioFrame.from_ndarray(waveform.float().cpu().numpy(), format='fltp', layout=layout) frame.sample_rate = audio_sample_rate frame.pts = 0 output.mux(audio_stream.encode(frame)) # Flush encoder output.mux(audio_stream.encode(None)) + + def as_trimmed( + self, + start_time: float | None = None, + duration: float | None = None, + strict_duration: bool = True, + ) -> VideoInput | None: + if self.get_duration() < start_time + duration: + return None + #TODO Consider tracking duration and trimming at time of save? + return VideoFromFile(self.get_stream_source(), start_time=start_time, duration=duration) diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py index ccf7b63d3..cd765a7c1 100644 --- a/comfy_extras/nodes_video.py +++ b/comfy_extras/nodes_video.py @@ -202,6 +202,56 @@ class LoadVideo(io.ComfyNode): return True +class VideoSlice(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="Video Slice", + display_name="Video Slice", + search_aliases=[ + "trim video duration", + "skip first frames", + "frame load cap", + "start time", + ], + category="image/video", + inputs=[ + io.Video.Input("video"), + io.Float.Input( + "start_time", + default=0.0, + max=1e5, + min=-1e5, + step=0.001, + tooltip="Start time in seconds", + ), + io.Float.Input( + "duration", + default=0.0, + min=0.0, + step=0.001, + tooltip="Duration in seconds, or 0 for unlimited duration", + ), + io.Boolean.Input( + "strict_duration", + default=False, + tooltip="If True, when the specified duration is not possible, an error will be raised.", + ), + ], + outputs=[ + io.Video.Output(), + ], + ) + + @classmethod + def execute(cls, video: io.Video.Type, start_time: float, duration: float, strict_duration: bool) -> io.NodeOutput: + trimmed = video.as_trimmed(start_time, duration, strict_duration=strict_duration) + if trimmed is not None: + return io.NodeOutput(trimmed) + raise ValueError( + f"Failed to slice video:\nSource duration: {video.get_duration()}\nStart time: {start_time}\nTarget duration: {duration}" + ) + class VideoExtension(ComfyExtension): @override @@ -212,6 +262,7 @@ class VideoExtension(ComfyExtension): CreateVideo, GetVideoComponents, LoadVideo, + VideoSlice, ] async def comfy_entrypoint() -> VideoExtension: From cdcf4119b3e826bd69fa986772485fb5b44a54cd Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Wed, 11 Feb 2026 10:45:19 +0800 Subject: [PATCH 06/25] [Trainer] training with proper offloading (#12189) * Fix bypass dtype/device moving * Force offloading mode for training * training context var * offloading implementation in training node * fix wrong input type * Support bypass load lora model, correct adapter/offloading handling --- comfy/ldm/flux/math.py | 39 +++++--- comfy/model_management.py | 5 + comfy/sampler_helpers.py | 16 +++- comfy/weight_adapter/bypass.py | 20 ++-- comfy_extras/nodes_train.py | 162 ++++++++++++++++++++++++++++----- 5 files changed, 196 insertions(+), 46 deletions(-) diff --git a/comfy/ldm/flux/math.py b/comfy/ldm/flux/math.py index f9597de5b..5e764bb46 100644 --- a/comfy/ldm/flux/math.py +++ b/comfy/ldm/flux/math.py @@ -29,19 +29,34 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor: return out.to(dtype=torch.float32, device=pos.device) +def _apply_rope1(x: Tensor, freqs_cis: Tensor): + x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2) + + x_out = freqs_cis[..., 0] * x_[..., 0] + x_out.addcmul_(freqs_cis[..., 1], x_[..., 1]) + + return x_out.reshape(*x.shape).type_as(x) + + +def _apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor): + return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis) + + try: import comfy.quant_ops - apply_rope = comfy.quant_ops.ck.apply_rope - apply_rope1 = comfy.quant_ops.ck.apply_rope1 + q_apply_rope = comfy.quant_ops.ck.apply_rope + q_apply_rope1 = comfy.quant_ops.ck.apply_rope1 + def apply_rope(xq, xk, freqs_cis): + if comfy.model_management.in_training: + return _apply_rope(xq, xk, freqs_cis) + else: + return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis) + def apply_rope1(x, freqs_cis): + if comfy.model_management.in_training: + return _apply_rope1(x, freqs_cis) + else: + return q_apply_rope1(x, freqs_cis) except: logging.warning("No comfy kitchen, using old apply_rope functions.") - def apply_rope1(x: Tensor, freqs_cis: Tensor): - x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2) - - x_out = freqs_cis[..., 0] * x_[..., 0] - x_out.addcmul_(freqs_cis[..., 1], x_[..., 1]) - - return x_out.reshape(*x.shape).type_as(x) - - def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor): - return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis) + apply_rope = _apply_rope + apply_rope1 = _apply_rope1 diff --git a/comfy/model_management.py b/comfy/model_management.py index 6018c1ab6..304931eb0 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -55,6 +55,11 @@ cpu_state = CPUState.GPU total_vram = 0 + +# Training Related State +in_training = False + + def get_supported_float8_types(): float8_types = [] try: diff --git a/comfy/sampler_helpers.py b/comfy/sampler_helpers.py index 9134e6d71..1f75f2ba7 100644 --- a/comfy/sampler_helpers.py +++ b/comfy/sampler_helpers.py @@ -122,20 +122,26 @@ def estimate_memory(model, noise_shape, conds): minimum_memory_required = model.model.memory_required([noise_shape[0]] + list(noise_shape[1:]), cond_shapes=cond_shapes_min) return memory_required, minimum_memory_required -def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False): +def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False, force_offload=False): executor = comfy.patcher_extension.WrapperExecutor.new_executor( _prepare_sampling, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.PREPARE_SAMPLING, model_options, is_model_options=True) ) - return executor.execute(model, noise_shape, conds, model_options=model_options, force_full_load=force_full_load) + return executor.execute(model, noise_shape, conds, model_options=model_options, force_full_load=force_full_load, force_offload=force_offload) -def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False): +def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False, force_offload=False): real_model: BaseModel = None models, inference_memory = get_additional_models(conds, model.model_dtype()) models += get_additional_models_from_model_options(model_options) models += model.get_nested_additional_models() # TODO: does this require inference_memory update? - memory_required, minimum_memory_required = estimate_memory(model, noise_shape, conds) - comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required + inference_memory, minimum_memory_required=minimum_memory_required + inference_memory, force_full_load=force_full_load) + if force_offload: # In training + offload enabled, we want to force prepare sampling to trigger partial load + memory_required = 1e20 + minimum_memory_required = None + else: + memory_required, minimum_memory_required = estimate_memory(model, noise_shape, conds) + memory_required += inference_memory + minimum_memory_required += inference_memory + comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required, minimum_memory_required=minimum_memory_required, force_full_load=force_full_load) real_model = model.model return real_model, conds, models diff --git a/comfy/weight_adapter/bypass.py b/comfy/weight_adapter/bypass.py index d4aaf98ca..b9d5ec7d9 100644 --- a/comfy/weight_adapter/bypass.py +++ b/comfy/weight_adapter/bypass.py @@ -21,6 +21,7 @@ from typing import Optional, Union import torch import torch.nn as nn +import comfy.model_management from .base import WeightAdapterBase, WeightAdapterTrainBase from comfy.patcher_extension import PatcherInjection @@ -181,18 +182,21 @@ class BypassForwardHook: ) return # Already injected - # Move adapter weights to module's device to avoid CPU-GPU transfer on every forward - device = None + # Move adapter weights to compute device (GPU) + # Use get_torch_device() instead of module.weight.device because + # with offloading, module weights may be on CPU while compute happens on GPU + device = comfy.model_management.get_torch_device() + + # Get dtype from module weight if available dtype = None if hasattr(self.module, "weight") and self.module.weight is not None: - device = self.module.weight.device dtype = self.module.weight.dtype - elif hasattr(self.module, "W_q"): # Quantized layers might use different attr - device = self.module.W_q.device - dtype = self.module.W_q.dtype - if device is not None: - self._move_adapter_weights_to_device(device, dtype) + # Only use dtype if it's a standard float type, not quantized + if dtype is not None and dtype not in (torch.float32, torch.float16, torch.bfloat16): + dtype = None + + self._move_adapter_weights_to_device(device, dtype) self.original_forward = self.module.forward self.module.forward = self._bypass_forward diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index 024a89391..630eedc9f 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -4,6 +4,7 @@ import os import numpy as np import safetensors import torch +import torch.nn as nn import torch.utils.checkpoint from tqdm.auto import trange from PIL import Image, ImageDraw, ImageFont @@ -27,6 +28,11 @@ class TrainGuider(comfy_extras.nodes_custom_sampler.Guider_Basic): """ CFGGuider with modifications for training specific logic """ + + def __init__(self, *args, offloading=False, **kwargs): + super().__init__(*args, **kwargs) + self.offloading = offloading + def outer_sample( self, noise, @@ -45,9 +51,11 @@ class TrainGuider(comfy_extras.nodes_custom_sampler.Guider_Basic): noise.shape, self.conds, self.model_options, - force_full_load=True, # mirror behavior in TrainLoraNode.execute() to keep model loaded + force_full_load=not self.offloading, + force_offload=self.offloading, ) ) + torch.cuda.empty_cache() device = self.model_patcher.load_device if denoise_mask is not None: @@ -404,16 +412,97 @@ def find_all_highest_child_module_with_forward( return result -def patch(m): +def find_modules_at_depth( + model: nn.Module, depth: int = 1, result=None, current_depth=0, name=None +) -> list[nn.Module]: + """ + Find modules at a specific depth level for gradient checkpointing. + + Args: + model: The model to search + depth: Target depth level (1 = top-level blocks, 2 = their children, etc.) + result: Accumulator for results + current_depth: Current recursion depth + name: Current module name for logging + + Returns: + List of modules at the target depth + """ + if result is None: + result = [] + name = name or "root" + + # Skip container modules (they don't have meaningful forward) + is_container = isinstance(model, (nn.ModuleList, nn.Sequential, nn.ModuleDict)) + has_forward = hasattr(model, "forward") and not is_container + + if has_forward: + current_depth += 1 + if current_depth == depth: + result.append(model) + logging.debug(f"Found module at depth {depth}: {name} ({model.__class__.__name__})") + return result + + # Recurse into children + for next_name, child in model.named_children(): + find_modules_at_depth(child, depth, result, current_depth, f"{name}.{next_name}") + + return result + + +class OffloadCheckpointFunction(torch.autograd.Function): + """ + Gradient checkpointing that works with weight offloading. + + Forward: no_grad -> compute -> weights can be freed + Backward: enable_grad -> recompute -> backward -> weights can be freed + + For single input, single output modules (Linear, Conv*). + """ + + @staticmethod + def forward(ctx, x: torch.Tensor, forward_fn): + ctx.save_for_backward(x) + ctx.forward_fn = forward_fn + with torch.no_grad(): + return forward_fn(x) + + @staticmethod + def backward(ctx, grad_out: torch.Tensor): + x, = ctx.saved_tensors + forward_fn = ctx.forward_fn + + # Clear context early + ctx.forward_fn = None + + with torch.enable_grad(): + x_detached = x.detach().requires_grad_(True) + y = forward_fn(x_detached) + y.backward(grad_out) + grad_x = x_detached.grad + + # Explicit cleanup + del y, x_detached, forward_fn + + return grad_x, None + + +def patch(m, offloading=False): if not hasattr(m, "forward"): return org_forward = m.forward - def fwd(args, kwargs): - return org_forward(*args, **kwargs) + # Branch 1: Linear/Conv* -> offload-compatible checkpoint (single input/output) + if offloading and isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): + def checkpointing_fwd(x): + return OffloadCheckpointFunction.apply(x, org_forward) + # Branch 2: Others -> standard checkpoint + else: + def fwd(args, kwargs): + return org_forward(*args, **kwargs) - def checkpointing_fwd(*args, **kwargs): - return torch.utils.checkpoint.checkpoint(fwd, args, kwargs, use_reentrant=False) + def checkpointing_fwd(*args, **kwargs): + return torch.utils.checkpoint.checkpoint(fwd, args, kwargs, use_reentrant=False) m.org_forward = org_forward m.forward = checkpointing_fwd @@ -936,6 +1025,18 @@ class TrainLoraNode(io.ComfyNode): default=True, tooltip="Use gradient checkpointing for training.", ), + io.Int.Input( + "checkpoint_depth", + default=1, + min=1, + max=5, + tooltip="Depth level for gradient checkpointing.", + ), + io.Boolean.Input( + "offloading", + default=False, + tooltip="Depth level for gradient checkpointing.", + ), io.Combo.Input( "existing_lora", options=folder_paths.get_filename_list("loras") + ["[None]"], @@ -982,6 +1083,8 @@ class TrainLoraNode(io.ComfyNode): lora_dtype, algorithm, gradient_checkpointing, + checkpoint_depth, + offloading, existing_lora, bucket_mode, bypass_mode, @@ -1000,6 +1103,8 @@ class TrainLoraNode(io.ComfyNode): lora_dtype = lora_dtype[0] algorithm = algorithm[0] gradient_checkpointing = gradient_checkpointing[0] + offloading = offloading[0] + checkpoint_depth = checkpoint_depth[0] existing_lora = existing_lora[0] bucket_mode = bucket_mode[0] bypass_mode = bypass_mode[0] @@ -1054,16 +1159,18 @@ class TrainLoraNode(io.ComfyNode): # Setup gradient checkpointing if gradient_checkpointing: - for m in find_all_highest_child_module_with_forward( - mp.model.diffusion_model - ): - patch(m) + modules_to_patch = find_modules_at_depth( + mp.model.diffusion_model, depth=checkpoint_depth + ) + logging.info(f"Gradient checkpointing: patching {len(modules_to_patch)} modules at depth {checkpoint_depth}") + for m in modules_to_patch: + patch(m, offloading=offloading) torch.cuda.empty_cache() # With force_full_load=False we should be able to have offloading # But for offloading in training we need custom AutoGrad hooks for fwd/bwd comfy.model_management.load_models_gpu( - [mp], memory_required=1e20, force_full_load=True + [mp], memory_required=1e20, force_full_load=not offloading ) torch.cuda.empty_cache() @@ -1100,7 +1207,7 @@ class TrainLoraNode(io.ComfyNode): ) # Setup guider - guider = TrainGuider(mp) + guider = TrainGuider(mp, offloading=offloading) guider.set_conds(positive) # Inject bypass hooks if bypass mode is enabled @@ -1113,6 +1220,7 @@ class TrainLoraNode(io.ComfyNode): # Run training loop try: + comfy.model_management.in_training = True _run_training_loop( guider, train_sampler, @@ -1123,6 +1231,7 @@ class TrainLoraNode(io.ComfyNode): multi_res, ) finally: + comfy.model_management.in_training = False # Eject bypass hooks if they were injected if bypass_injections is not None: for injection in bypass_injections: @@ -1132,19 +1241,20 @@ class TrainLoraNode(io.ComfyNode): unpatch(m) del train_sampler, optimizer - # Finalize adapters + for param in lora_sd: + lora_sd[param] = lora_sd[param].to(lora_dtype).detach() + for adapter in all_weight_adapters: adapter.requires_grad_(False) - - for param in lora_sd: - lora_sd[param] = lora_sd[param].to(lora_dtype) + del adapter + del all_weight_adapters # mp in train node is highly specialized for training # use it in inference will result in bad behavior so we don't return it return io.NodeOutput(lora_sd, loss_map, steps + existing_steps) -class LoraModelLoader(io.ComfyNode):# +class LoraModelLoader(io.ComfyNode): @classmethod def define_schema(cls): return io.Schema( @@ -1166,6 +1276,11 @@ class LoraModelLoader(io.ComfyNode):# max=100.0, tooltip="How strongly to modify the diffusion model. This value can be negative.", ), + io.Boolean.Input( + "bypass", + default=False, + tooltip="When enabled, applies LoRA in bypass mode without modifying base model weights. Useful for training and when model weights are offloaded.", + ), ], outputs=[ io.Model.Output( @@ -1175,13 +1290,18 @@ class LoraModelLoader(io.ComfyNode):# ) @classmethod - def execute(cls, model, lora, strength_model): + def execute(cls, model, lora, strength_model, bypass=False): if strength_model == 0: return io.NodeOutput(model) - model_lora, _ = comfy.sd.load_lora_for_models( - model, None, lora, strength_model, 0 - ) + if bypass: + model_lora, _ = comfy.sd.load_bypass_lora_for_models( + model, None, lora, strength_model, 0 + ) + else: + model_lora, _ = comfy.sd.load_lora_for_models( + model, None, lora, strength_model, 0 + ) return io.NodeOutput(model_lora) From 76a7fa96dbdc2eda89218601fe3aed5997df055f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 10 Feb 2026 19:04:32 -0800 Subject: [PATCH 07/25] Make built in lora training work on anima. (#12402) --- comfy/ldm/anima/model.py | 16 ++++++++++++++-- comfy/model_base.py | 12 ++++++++---- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/comfy/ldm/anima/model.py b/comfy/ldm/anima/model.py index 2e6ed58fa..6fb51c4a4 100644 --- a/comfy/ldm/anima/model.py +++ b/comfy/ldm/anima/model.py @@ -195,8 +195,20 @@ class Anima(MiniTrainDIT): super().__init__(*args, **kwargs) self.llm_adapter = LLMAdapter(device=kwargs.get("device"), dtype=kwargs.get("dtype"), operations=kwargs.get("operations")) - def preprocess_text_embeds(self, text_embeds, text_ids): + def preprocess_text_embeds(self, text_embeds, text_ids, t5xxl_weights=None): if text_ids is not None: - return self.llm_adapter(text_embeds, text_ids) + out = self.llm_adapter(text_embeds, text_ids) + if t5xxl_weights is not None: + out = out * t5xxl_weights + + if out.shape[1] < 512: + out = torch.nn.functional.pad(out, (0, 0, 0, 512 - out.shape[1])) + return out else: return text_embeds + + def forward(self, x, timesteps, context, **kwargs): + t5xxl_ids = kwargs.pop("t5xxl_ids", None) + if t5xxl_ids is not None: + context = self.preprocess_text_embeds(context, t5xxl_ids, t5xxl_weights=kwargs.pop("t5xxl_weights", None)) + return super().forward(x, timesteps, context, **kwargs) diff --git a/comfy/model_base.py b/comfy/model_base.py index 858789b30..4a74cb1ce 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1160,12 +1160,16 @@ class Anima(BaseModel): device = kwargs["device"] if cross_attn is not None: if t5xxl_ids is not None: - cross_attn = self.diffusion_model.preprocess_text_embeds(cross_attn.to(device=device, dtype=self.get_dtype()), t5xxl_ids.unsqueeze(0).to(device=device)) if t5xxl_weights is not None: - cross_attn *= t5xxl_weights.unsqueeze(0).unsqueeze(-1).to(cross_attn) + t5xxl_weights = t5xxl_weights.unsqueeze(0).unsqueeze(-1).to(cross_attn) + t5xxl_ids = t5xxl_ids.unsqueeze(0) + + if torch.is_inference_mode_enabled(): # if not we are training + cross_attn = self.diffusion_model.preprocess_text_embeds(cross_attn.to(device=device, dtype=self.get_dtype()), t5xxl_ids.to(device=device), t5xxl_weights=t5xxl_weights.to(device=device, dtype=self.get_dtype())) + else: + out['t5xxl_ids'] = comfy.conds.CONDRegular(t5xxl_ids) + out['t5xxl_weights'] = comfy.conds.CONDRegular(t5xxl_weights) - if cross_attn.shape[1] < 512: - cross_attn = torch.nn.functional.pad(cross_attn, (0, 0, 0, 512 - cross_attn.shape[1])) out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) return out From 2c7cef4a23c08e3f02a33c693d927158a15a11f6 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 11 Feb 2026 20:51:49 +0200 Subject: [PATCH 08/25] fix(api-nodes): retry on connection errors during polling instead of aborting (#12393) --- comfy_api_nodes/util/client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py index 8a1259506..391748e7a 100644 --- a/comfy_api_nodes/util/client.py +++ b/comfy_api_nodes/util/client.py @@ -143,9 +143,9 @@ async def poll_op( poll_interval: float = 5.0, max_poll_attempts: int = 160, timeout_per_poll: float = 120.0, - max_retries_per_poll: int = 3, + max_retries_per_poll: int = 10, retry_delay_per_poll: float = 1.0, - retry_backoff_per_poll: float = 2.0, + retry_backoff_per_poll: float = 1.4, estimated_duration: int | None = None, cancel_endpoint: ApiEndpoint | None = None, cancel_timeout: float = 10.0, @@ -240,9 +240,9 @@ async def poll_op_raw( poll_interval: float = 5.0, max_poll_attempts: int = 160, timeout_per_poll: float = 120.0, - max_retries_per_poll: int = 3, + max_retries_per_poll: int = 10, retry_delay_per_poll: float = 1.0, - retry_backoff_per_poll: float = 2.0, + retry_backoff_per_poll: float = 1.4, estimated_duration: int | None = None, cancel_endpoint: ApiEndpoint | None = None, cancel_timeout: float = 10.0, From 4993411fd9a43d642971925272c3748d9e058131 Mon Sep 17 00:00:00 2001 From: Benjamin Lu Date: Wed, 11 Feb 2026 11:15:13 -0800 Subject: [PATCH 09/25] Dispatch desktop auto-bump when a ComfyUI release is published (#12398) * Dispatch desktop auto-bump on ComfyUI release publish * Fix release webhook secret checks in step conditions * Require desktop dispatch token in release webhook * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Co-authored-by: Luke Mino-Altherr Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Jedrzej Kosinski --- .github/workflows/release-webhook.yml | 36 +++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/.github/workflows/release-webhook.yml b/.github/workflows/release-webhook.yml index 6fceb7560..737e4c488 100644 --- a/.github/workflows/release-webhook.yml +++ b/.github/workflows/release-webhook.yml @@ -7,6 +7,8 @@ on: jobs: send-webhook: runs-on: ubuntu-latest + env: + DESKTOP_REPO_DISPATCH_TOKEN: ${{ secrets.DESKTOP_REPO_DISPATCH_TOKEN }} steps: - name: Send release webhook env: @@ -106,3 +108,37 @@ jobs: --fail --silent --show-error echo "✅ Release webhook sent successfully" + + - name: Send repository dispatch to desktop + env: + DISPATCH_TOKEN: ${{ env.DESKTOP_REPO_DISPATCH_TOKEN }} + RELEASE_TAG: ${{ github.event.release.tag_name }} + RELEASE_URL: ${{ github.event.release.html_url }} + run: | + set -euo pipefail + + if [ -z "${DISPATCH_TOKEN:-}" ]; then + echo "::error::DESKTOP_REPO_DISPATCH_TOKEN is required but not set." + exit 1 + fi + + PAYLOAD="$(jq -n \ + --arg release_tag "$RELEASE_TAG" \ + --arg release_url "$RELEASE_URL" \ + '{ + event_type: "comfyui_release_published", + client_payload: { + release_tag: $release_tag, + release_url: $release_url + } + }')" + + curl -fsSL \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${DISPATCH_TOKEN}" \ + https://api.github.com/repos/Comfy-Org/desktop/dispatches \ + -d "$PAYLOAD" + + echo "✅ Dispatched ComfyUI release ${RELEASE_TAG} to Comfy-Org/desktop" From 2b7cc7e3b69127a81b9232d4e8305eb678fa3d0c Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 11 Feb 2026 21:30:19 +0200 Subject: [PATCH 10/25] [API Nodes] enable Magnific Upscalers (#12179) * feat(api-nodes): enable Magnific Upscalers * update price badges --------- Co-authored-by: Jedrzej Kosinski --- comfy_api_nodes/nodes_magnific.py | 62 +++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/comfy_api_nodes/nodes_magnific.py b/comfy_api_nodes/nodes_magnific.py index 013e71cc8..83a581c5d 100644 --- a/comfy_api_nodes/nodes_magnific.py +++ b/comfy_api_nodes/nodes_magnific.py @@ -30,6 +30,30 @@ from comfy_api_nodes.util import ( validate_image_dimensions, ) +_EUR_TO_USD = 1.19 + + +def _tier_price_eur(megapixels: float) -> float: + """Price in EUR for a single Magnific upscaling step based on input megapixels.""" + if megapixels <= 1.3: + return 0.143 + if megapixels <= 3.0: + return 0.286 + if megapixels <= 6.4: + return 0.429 + return 1.716 + + +def _calculate_magnific_upscale_price_usd(width: int, height: int, scale: int) -> float: + """Calculate total Magnific upscale price in USD for given input dimensions and scale factor.""" + num_steps = int(math.log2(scale)) + total_eur = 0.0 + pixels = width * height + for _ in range(num_steps): + total_eur += _tier_price_eur(pixels / 1_000_000) + pixels *= 4 + return round(total_eur * _EUR_TO_USD, 2) + class MagnificImageUpscalerCreativeNode(IO.ComfyNode): @classmethod @@ -103,11 +127,20 @@ class MagnificImageUpscalerCreativeNode(IO.ComfyNode): ], is_api_node=True, price_badge=IO.PriceBadge( - depends_on=IO.PriceBadgeDepends(widgets=["scale_factor"]), + depends_on=IO.PriceBadgeDepends(widgets=["scale_factor", "auto_downscale"]), expr=""" ( - $max := widgets.scale_factor = "2x" ? 1.326 : 1.657; - {"type": "range_usd", "min_usd": 0.11, "max_usd": $max} + $ad := widgets.auto_downscale; + $mins := $ad + ? {"2x": 0.172, "4x": 0.343, "8x": 0.515, "16x": 0.515} + : {"2x": 0.172, "4x": 0.343, "8x": 0.515, "16x": 0.844}; + $maxs := {"2x": 0.515, "4x": 0.844, "8x": 1.015, "16x": 1.187}; + { + "type": "range_usd", + "min_usd": $lookup($mins, widgets.scale_factor), + "max_usd": $lookup($maxs, widgets.scale_factor), + "format": { "approximate": true } + } ) """, ), @@ -168,6 +201,10 @@ class MagnificImageUpscalerCreativeNode(IO.ComfyNode): f"Use a smaller input image or lower scale factor." ) + final_height, final_width = get_image_dimensions(image) + actual_scale = int(scale_factor.rstrip("x")) + price_usd = _calculate_magnific_upscale_price_usd(final_width, final_height, actual_scale) + initial_res = await sync_op( cls, ApiEndpoint(path="/proxy/freepik/v1/ai/image-upscaler", method="POST"), @@ -189,6 +226,7 @@ class MagnificImageUpscalerCreativeNode(IO.ComfyNode): ApiEndpoint(path=f"/proxy/freepik/v1/ai/image-upscaler/{initial_res.task_id}"), response_model=TaskResponse, status_extractor=lambda x: x.status, + price_extractor=lambda _: price_usd, poll_interval=10.0, max_poll_attempts=480, ) @@ -257,8 +295,14 @@ class MagnificImageUpscalerPreciseV2Node(IO.ComfyNode): depends_on=IO.PriceBadgeDepends(widgets=["scale_factor"]), expr=""" ( - $max := widgets.scale_factor = "2x" ? 1.326 : 1.657; - {"type": "range_usd", "min_usd": 0.11, "max_usd": $max} + $mins := {"2x": 0.172, "4x": 0.343, "8x": 0.515, "16x": 0.844}; + $maxs := {"2x": 2.045, "4x": 2.545, "8x": 2.889, "16x": 3.06}; + { + "type": "range_usd", + "min_usd": $lookup($mins, widgets.scale_factor), + "max_usd": $lookup($maxs, widgets.scale_factor), + "format": { "approximate": true } + } ) """, ), @@ -321,6 +365,9 @@ class MagnificImageUpscalerPreciseV2Node(IO.ComfyNode): f"Use a smaller input image or lower scale factor." ) + final_height, final_width = get_image_dimensions(image) + price_usd = _calculate_magnific_upscale_price_usd(final_width, final_height, requested_scale) + initial_res = await sync_op( cls, ApiEndpoint(path="/proxy/freepik/v1/ai/image-upscaler-precision-v2", method="POST"), @@ -339,6 +386,7 @@ class MagnificImageUpscalerPreciseV2Node(IO.ComfyNode): ApiEndpoint(path=f"/proxy/freepik/v1/ai/image-upscaler-precision-v2/{initial_res.task_id}"), response_model=TaskResponse, status_extractor=lambda x: x.status, + price_extractor=lambda _: price_usd, poll_interval=10.0, max_poll_attempts=480, ) @@ -877,8 +925,8 @@ class MagnificExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ - # MagnificImageUpscalerCreativeNode, - # MagnificImageUpscalerPreciseV2Node, + MagnificImageUpscalerCreativeNode, + MagnificImageUpscalerPreciseV2Node, MagnificImageStyleTransferNode, MagnificImageRelightNode, MagnificImageSkinEnhancerNode, From d297a749a2fa3a34ebff898797feef161bcd64c6 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Wed, 11 Feb 2026 11:50:16 -0800 Subject: [PATCH 11/25] dynamic_vram: Fix windows Aimdo crash + Fix LLM performance (#12408) * model_management: lazy-cache aimdo_tensor These tensors cosntructed from aimdo-allocations are CPU expensive to make on the pytorch side. Add a cache version that will be valid with signature match to fast path past whatever torch is doing. * dynamic_vram: Minimize fast path CPU work Move as much as possible inside the not resident if block and cache the formed weight and bias rather than the flat intermediates. In extreme layer weight rates this adds up. --- comfy/model_management.py | 8 ++++++-- comfy/model_patcher.py | 2 -- comfy/ops.py | 21 ++++++++++++++------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 304931eb0..38c3e482b 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1213,8 +1213,12 @@ def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, str signature = comfy_aimdo.model_vbar.vbar_fault(weight._v) if signature is not None: - v_tensor = comfy.memory_management.interpret_gathered_like(cast_geometry, weight._v_tensor)[0] - if not comfy_aimdo.model_vbar.vbar_signature_compare(signature, weight._v_signature): + if comfy_aimdo.model_vbar.vbar_signature_compare(signature, weight._v_signature): + v_tensor = weight._v_tensor + else: + raw_tensor = comfy_aimdo.torch.aimdo_to_tensor(weight._v, device) + v_tensor = comfy.memory_management.interpret_gathered_like(cast_geometry, raw_tensor)[0] + weight._v_tensor = v_tensor weight._v_signature = signature #Send it over v_tensor.copy_(weight, non_blocking=non_blocking) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 19c9031ea..224e218e3 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1542,7 +1542,6 @@ class ModelPatcherDynamic(ModelPatcher): if vbar is not None and not hasattr(m, "_v"): m._v = vbar.alloc(v_weight_size) - m._v_tensor = comfy_aimdo.torch.aimdo_to_tensor(m._v, device_to) allocated_size += v_weight_size else: @@ -1557,7 +1556,6 @@ class ModelPatcherDynamic(ModelPatcher): weight_size = geometry.numel() * geometry.element_size() if vbar is not None and not hasattr(weight, "_v"): weight._v = vbar.alloc(weight_size) - weight._v_tensor = comfy_aimdo.torch.aimdo_to_tensor(weight._v, device_to) weight._model_dtype = model_dtype allocated_size += weight_size vbar.set_watermark_limit(allocated_size) diff --git a/comfy/ops.py b/comfy/ops.py index 33803b223..688937e43 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -83,14 +83,18 @@ def cast_to_input(weight, input, non_blocking=False, copy=True): def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype): offload_stream = None xfer_dest = None - cast_geometry = comfy.memory_management.tensors_to_geometries([ s.weight, s.bias ]) signature = comfy_aimdo.model_vbar.vbar_fault(s._v) - if signature is not None: - xfer_dest = s._v_tensor resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature) + if signature is not None: + if resident: + weight = s._v_weight + bias = s._v_bias + else: + xfer_dest = comfy_aimdo.torch.aimdo_to_tensor(s._v, device) if not resident: + cast_geometry = comfy.memory_management.tensors_to_geometries([ s.weight, s.bias ]) cast_dest = None xfer_source = [ s.weight, s.bias ] @@ -140,9 +144,13 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu post_cast.copy_(pre_cast) xfer_dest = cast_dest - params = comfy.memory_management.interpret_gathered_like(cast_geometry, xfer_dest) - weight = params[0] - bias = params[1] + params = comfy.memory_management.interpret_gathered_like(cast_geometry, xfer_dest) + weight = params[0] + bias = params[1] + if signature is not None: + s._v_weight = weight + s._v_bias = bias + s._v_signature=signature def post_cast(s, param_key, x, dtype, resident, update_weight): lowvram_fn = getattr(s, param_key + "_lowvram_function", None) @@ -182,7 +190,6 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu weight = post_cast(s, "weight", weight, dtype, resident, update_weight) if s.bias is not None: bias = post_cast(s, "bias", bias, bias_dtype, resident, update_weight) - s._v_signature=signature #FIXME: weird offload return protocol return weight, bias, (offload_stream, device if signature is not None else None, None) From 2a4328d639810858aa625c7bfedb974a13a57abe Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Wed, 11 Feb 2026 11:53:42 -0800 Subject: [PATCH 12/25] ace15: Use dynamic_vram friendly trange (#12409) Factor out the ksampler trange and use it in ACE LLM to prevent the silent stall at 0 and rate distortion due to first-step model load. --- comfy/k_diffusion/sampling.py | 32 ++------------------------------ comfy/text_encoders/ace15.py | 3 +-- comfy/utils.py | 27 +++++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 32 deletions(-) diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index c0c51d51a..6978eb717 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -1,12 +1,11 @@ import math -import time from functools import partial from scipy import integrate import torch from torch import nn import torchsde -from tqdm.auto import trange as trange_, tqdm +from tqdm.auto import tqdm from . import utils from . import deis @@ -15,34 +14,7 @@ import comfy.model_patcher import comfy.model_sampling import comfy.memory_management - - -def trange(*args, **kwargs): - if comfy.memory_management.aimdo_allocator is None: - return trange_(*args, **kwargs) - - pbar = trange_(*args, **kwargs, smoothing=1.0) - pbar._i = 0 - pbar.set_postfix_str(" Model Initializing ... ") - - _update = pbar.update - - def warmup_update(n=1): - pbar._i += 1 - if pbar._i == 1: - pbar.i1_time = time.time() - pbar.set_postfix_str(" Model Initialization complete! ") - elif pbar._i == 2: - #bring forward the effective start time based the the diff between first and second iteration - #to attempt to remove load overhead from the final step rate estimate. - pbar.start_t = pbar.i1_time - (time.time() - pbar.i1_time) - pbar.set_postfix_str("") - - _update(n) - - pbar.update = warmup_update - return pbar - +from comfy.utils import model_trange as trange def append_zero(x): return torch.cat([x, x.new_zeros([1])]) diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py index 73697b3c1..b8198a820 100644 --- a/comfy/text_encoders/ace15.py +++ b/comfy/text_encoders/ace15.py @@ -3,7 +3,6 @@ import comfy.text_encoders.llama from comfy import sd1_clip import torch import math -from tqdm.auto import trange import yaml import comfy.utils @@ -52,7 +51,7 @@ def sample_manual_loop_no_classes( progress_bar = comfy.utils.ProgressBar(max_new_tokens) - for step in trange(max_new_tokens, desc="LM sampling"): + for step in comfy.utils.model_trange(max_new_tokens, desc="LM sampling"): outputs = model.transformer(None, attention_mask, embeds=embeds.to(execution_dtype), num_tokens=num_tokens, intermediate_output=None, dtype=execution_dtype, embeds_info=embeds_info, past_key_values=past_key_values) next_token_logits = model.transformer.logits(outputs[0])[:, -1] past_key_values = outputs[2] diff --git a/comfy/utils.py b/comfy/utils.py index edd80cebe..e0a94e2e1 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -27,6 +27,7 @@ from PIL import Image import logging import itertools from torch.nn.functional import interpolate +from tqdm.auto import trange from einops import rearrange from comfy.cli_args import args, enables_dynamic_vram import json @@ -1155,6 +1156,32 @@ def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_am def tiled_scale(samples, function, tile_x=64, tile_y=64, overlap = 8, upscale_amount = 4, out_channels = 3, output_device="cpu", pbar = None): return tiled_scale_multidim(samples, function, (tile_y, tile_x), overlap=overlap, upscale_amount=upscale_amount, out_channels=out_channels, output_device=output_device, pbar=pbar) +def model_trange(*args, **kwargs): + if comfy.memory_management.aimdo_allocator is None: + return trange(*args, **kwargs) + + pbar = trange(*args, **kwargs, smoothing=1.0) + pbar._i = 0 + pbar.set_postfix_str(" Model Initializing ... ") + + _update = pbar.update + + def warmup_update(n=1): + pbar._i += 1 + if pbar._i == 1: + pbar.i1_time = time.time() + pbar.set_postfix_str(" Model Initialization complete! ") + elif pbar._i == 2: + #bring forward the effective start time based the the diff between first and second iteration + #to attempt to remove load overhead from the final step rate estimate. + pbar.start_t = pbar.i1_time - (time.time() - pbar.i1_time) + pbar.set_postfix_str("") + + _update(n) + + pbar.update = warmup_update + return pbar + PROGRESS_BAR_ENABLED = True def set_progress_bar_enabled(enabled): global PROGRESS_BAR_ENABLED From 3fe61cedda090c744dcf6f579ed48744fa66ef5f Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Wed, 11 Feb 2026 11:54:02 -0800 Subject: [PATCH 13/25] model_patcher: guard against none model_dtype (#12410) Handle the case where the _model_dtype exists but is none with the intended fallback. --- comfy/model_patcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 224e218e3..f278fccac 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1525,7 +1525,7 @@ class ModelPatcherDynamic(ModelPatcher): setattr(m, param_key + "_function", weight_function) geometry = weight if not isinstance(weight, QuantizedTensor): - model_dtype = getattr(m, param_key + "_comfy_model_dtype", weight.dtype) + model_dtype = getattr(m, param_key + "_comfy_model_dtype", None) or weight.dtype weight._model_dtype = model_dtype geometry = comfy.memory_management.TensorGeometry(shape=weight.shape, dtype=model_dtype) return comfy.memory_management.vram_aligned_size(geometry) @@ -1551,7 +1551,7 @@ class ModelPatcherDynamic(ModelPatcher): weight.seed_key = key set_dirty(weight, dirty) geometry = weight - model_dtype = getattr(m, param + "_comfy_model_dtype", weight.dtype) + model_dtype = getattr(m, param + "_comfy_model_dtype", None) or weight.dtype geometry = comfy.memory_management.TensorGeometry(shape=weight.shape, dtype=model_dtype) weight_size = geometry.numel() * geometry.element_size() if vbar is not None and not hasattr(weight, "_v"): From e5ae670a4016d3698a806e7f840fcecc50639848 Mon Sep 17 00:00:00 2001 From: askmyteapot <62238146+askmyteapot@users.noreply.github.com> Date: Thu, 12 Feb 2026 11:28:48 +1000 Subject: [PATCH 14/25] Update ace15.py to allow min_p sampling (#12373) --- comfy/text_encoders/ace15.py | 15 ++++++++++++--- comfy_extras/nodes_ace.py | 5 +++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py index b8198a820..0fdd4669f 100644 --- a/comfy/text_encoders/ace15.py +++ b/comfy/text_encoders/ace15.py @@ -16,6 +16,7 @@ def sample_manual_loop_no_classes( temperature: float = 0.85, top_p: float = 0.9, top_k: int = None, + min_p: float = 0.000, seed: int = 1, min_tokens: int = 1, max_new_tokens: int = 2048, @@ -80,6 +81,12 @@ def sample_manual_loop_no_classes( min_val = top_k_vals[..., -1, None] cfg_logits[cfg_logits < min_val] = remove_logit_value + if min_p is not None and min_p > 0: + probs = torch.softmax(cfg_logits, dim=-1) + p_max = probs.max(dim=-1, keepdim=True).values + indices_to_remove = probs < (min_p * p_max) + cfg_logits[indices_to_remove] = remove_logit_value + if top_p is not None and top_p < 1.0: sorted_logits, sorted_indices = torch.sort(cfg_logits, descending=True) cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1) @@ -110,7 +117,7 @@ def sample_manual_loop_no_classes( return output_audio_codes -def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=1024, seed=0, cfg_scale=2.0, temperature=0.85, top_p=0.9, top_k=0): +def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=1024, seed=0, cfg_scale=2.0, temperature=0.85, top_p=0.9, top_k=0, min_p=0.000): positive = [[token for token, _ in inner_list] for inner_list in positive] positive = positive[0] @@ -134,7 +141,7 @@ def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=102 paddings = [] ids = [positive] - return sample_manual_loop_no_classes(model, ids, paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens) + return sample_manual_loop_no_classes(model, ids, paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens) class ACE15Tokenizer(sd1_clip.SD1Tokenizer): @@ -192,6 +199,7 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer): temperature = kwargs.get("temperature", 0.85) top_p = kwargs.get("top_p", 0.9) top_k = kwargs.get("top_k", 0.0) + min_p = kwargs.get("min_p", 0.000) duration = math.ceil(duration) kwargs["duration"] = duration @@ -239,6 +247,7 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer): "temperature": temperature, "top_p": top_p, "top_k": top_k, + "min_p": min_p, } return out @@ -299,7 +308,7 @@ class ACE15TEModel(torch.nn.Module): lm_metadata = token_weight_pairs["lm_metadata"] if lm_metadata["generate_audio_codes"]: - audio_codes = generate_audio_codes(getattr(self, self.lm_model, self.qwen3_06b), token_weight_pairs["lm_prompt"], token_weight_pairs["lm_prompt_negative"], min_tokens=lm_metadata["min_tokens"], max_tokens=lm_metadata["max_tokens"], seed=lm_metadata["seed"], cfg_scale=lm_metadata["cfg_scale"], temperature=lm_metadata["temperature"], top_p=lm_metadata["top_p"], top_k=lm_metadata["top_k"]) + audio_codes = generate_audio_codes(getattr(self, self.lm_model, self.qwen3_06b), token_weight_pairs["lm_prompt"], token_weight_pairs["lm_prompt_negative"], min_tokens=lm_metadata["min_tokens"], max_tokens=lm_metadata["min_tokens"], seed=lm_metadata["seed"], cfg_scale=lm_metadata["cfg_scale"], temperature=lm_metadata["temperature"], top_p=lm_metadata["top_p"], top_k=lm_metadata["top_k"], min_p=lm_metadata["min_p"]) out["audio_codes"] = [audio_codes] return base_out, None, out diff --git a/comfy_extras/nodes_ace.py b/comfy_extras/nodes_ace.py index dde5bbd2a..9cf84ab4d 100644 --- a/comfy_extras/nodes_ace.py +++ b/comfy_extras/nodes_ace.py @@ -49,13 +49,14 @@ class TextEncodeAceStepAudio15(io.ComfyNode): io.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True), io.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True), io.Int.Input("top_k", default=0, min=0, max=100, advanced=True), + io.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True), ], outputs=[io.Conditioning.Output()], ) @classmethod - def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k) -> io.NodeOutput: - tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed, generate_audio_codes=generate_audio_codes, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k) + def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> io.NodeOutput: + tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed, generate_audio_codes=generate_audio_codes, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p) conditioning = clip.encode_from_tokens_scheduled(tokens) return io.NodeOutput(conditioning) From 66c18522fbcde5b62731e3fb080a84b14e3dacfc Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 11 Feb 2026 19:12:16 -0800 Subject: [PATCH 15/25] Add a tip for common error. (#12414) --- execution.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/execution.py b/execution.py index 896862c6b..f549a2f0f 100644 --- a/execution.py +++ b/execution.py @@ -623,6 +623,8 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, logging.info("Memory summary: {}".format(comfy.model_management.debug_memory_summary())) logging.error("Got an OOM, unloading all loaded models.") comfy.model_management.unload_all_models() + elif isinstance(ex, RuntimeError) and ("mat1 and mat2 shapes" in str(ex)) and "Sampler" in class_type: + tips = "\n\nTIPS: If you have any \"Load CLIP\" or \"*CLIP Loader\" nodes in your workflow connected to this sampler node make sure the correct file(s) and type is selected." error_details = { "node_id": real_node_id, From 4a93a62371b64f9d11a140a09faf985c48902d2e Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 12 Feb 2026 11:38:51 +0200 Subject: [PATCH 16/25] fix(api-nodes): add separate retry budget for 429 rate limit responses (#12421) --- comfy_api_nodes/util/client.py | 207 ++++++++++++----------- comfy_api_nodes/util/download_helpers.py | 30 ++-- comfy_api_nodes/util/request_logger.py | 66 ++++---- comfy_api_nodes/util/upload_helpers.py | 55 +++--- 4 files changed, 177 insertions(+), 181 deletions(-) diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py index 391748e7a..94886af7b 100644 --- a/comfy_api_nodes/util/client.py +++ b/comfy_api_nodes/util/client.py @@ -57,6 +57,7 @@ class _RequestConfig: files: dict[str, Any] | list[tuple[str, Any]] | None multipart_parser: Callable | None max_retries: int + max_retries_on_rate_limit: int retry_delay: float retry_backoff: float wait_label: str = "Waiting" @@ -65,6 +66,7 @@ class _RequestConfig: final_label_on_success: str | None = "Completed" progress_origin_ts: float | None = None price_extractor: Callable[[dict[str, Any]], float | None] | None = None + is_rate_limited: Callable[[int, Any], bool] | None = None @dataclass @@ -78,7 +80,7 @@ class _PollUIState: active_since: float | None = None # start time of current active interval (None if queued) -_RETRY_STATUS = {408, 429, 500, 502, 503, 504} +_RETRY_STATUS = {408, 500, 502, 503, 504} # status 429 is handled separately COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done", "complete"] FAILED_STATUSES = ["cancelled", "canceled", "canceling", "fail", "failed", "error"] QUEUED_STATUSES = ["created", "queued", "queueing", "submitted", "initializing"] @@ -103,6 +105,8 @@ async def sync_op( final_label_on_success: str | None = "Completed", progress_origin_ts: float | None = None, monitor_progress: bool = True, + max_retries_on_rate_limit: int = 16, + is_rate_limited: Callable[[int, Any], bool] | None = None, ) -> M: raw = await sync_op_raw( cls, @@ -122,6 +126,8 @@ async def sync_op( final_label_on_success=final_label_on_success, progress_origin_ts=progress_origin_ts, monitor_progress=monitor_progress, + max_retries_on_rate_limit=max_retries_on_rate_limit, + is_rate_limited=is_rate_limited, ) if not isinstance(raw, dict): raise Exception("Expected JSON response to validate into a Pydantic model, got non-JSON (binary or text).") @@ -194,6 +200,8 @@ async def sync_op_raw( final_label_on_success: str | None = "Completed", progress_origin_ts: float | None = None, monitor_progress: bool = True, + max_retries_on_rate_limit: int = 16, + is_rate_limited: Callable[[int, Any], bool] | None = None, ) -> dict[str, Any] | bytes: """ Make a single network request. @@ -222,6 +230,8 @@ async def sync_op_raw( final_label_on_success=final_label_on_success, progress_origin_ts=progress_origin_ts, price_extractor=price_extractor, + max_retries_on_rate_limit=max_retries_on_rate_limit, + is_rate_limited=is_rate_limited, ) return await _request_base(cfg, expect_binary=as_binary) @@ -506,7 +516,7 @@ def _friendly_http_message(status: int, body: Any) -> str: if status == 409: return "There is a problem with your account. Please contact support@comfy.org." if status == 429: - return "Rate Limit Exceeded: Please try again later." + return "Rate Limit Exceeded: The server returned 429 after all retry attempts. Please wait and try again." try: if isinstance(body, dict): err = body.get("error") @@ -586,6 +596,8 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): start_time = cfg.progress_origin_ts if cfg.progress_origin_ts is not None else time.monotonic() attempt = 0 delay = cfg.retry_delay + rate_limit_attempts = 0 + rate_limit_delay = cfg.retry_delay operation_succeeded: bool = False final_elapsed_seconds: int | None = None extracted_price: float | None = None @@ -653,17 +665,14 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): payload_headers["Content-Type"] = "application/json" payload_kw["json"] = cfg.data or {} - try: - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, - request_url=url, - request_headers=dict(payload_headers) if payload_headers else None, - request_params=dict(params) if params else None, - request_data=request_body_log, - ) - except Exception as _log_e: - logging.debug("[DEBUG] request logging failed: %s", _log_e) + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + request_headers=dict(payload_headers) if payload_headers else None, + request_params=dict(params) if params else None, + request_data=request_body_log, + ) req_coro = sess.request(method, url, params=params, **payload_kw) req_task = asyncio.create_task(req_coro) @@ -688,41 +697,33 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): body = await resp.json() except (ContentTypeError, json.JSONDecodeError): body = await resp.text() - if resp.status in _RETRY_STATUS and attempt <= cfg.max_retries: + should_retry = False + wait_time = 0.0 + retry_label = "" + is_rl = resp.status == 429 or ( + cfg.is_rate_limited is not None and cfg.is_rate_limited(resp.status, body) + ) + if is_rl and rate_limit_attempts < cfg.max_retries_on_rate_limit: + rate_limit_attempts += 1 + wait_time = min(rate_limit_delay, 30.0) + rate_limit_delay *= cfg.retry_backoff + retry_label = f"rate-limit retry {rate_limit_attempts} of {cfg.max_retries_on_rate_limit}" + should_retry = True + elif resp.status in _RETRY_STATUS and (attempt - rate_limit_attempts) <= cfg.max_retries: + wait_time = delay + delay *= cfg.retry_backoff + retry_label = f"retry {attempt - rate_limit_attempts} of {cfg.max_retries}" + should_retry = True + + if should_retry: logging.warning( - "HTTP %s %s -> %s. Retrying in %.2fs (retry %d of %d).", + "HTTP %s %s -> %s. Waiting %.2fs (%s).", method, url, resp.status, - delay, - attempt, - cfg.max_retries, + wait_time, + retry_label, ) - try: - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, - request_url=url, - response_status_code=resp.status, - response_headers=dict(resp.headers), - response_content=body, - error_message=_friendly_http_message(resp.status, body), - ) - except Exception as _log_e: - logging.debug("[DEBUG] response logging failed: %s", _log_e) - - await sleep_with_interrupt( - delay, - cfg.node_cls, - cfg.wait_label if cfg.monitor_progress else None, - start_time if cfg.monitor_progress else None, - cfg.estimated_total, - display_callback=_display_time_progress if cfg.monitor_progress else None, - ) - delay *= cfg.retry_backoff - continue - msg = _friendly_http_message(resp.status, body) - try: request_logger.log_request_response( operation_id=operation_id, request_method=method, @@ -730,10 +731,27 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): response_status_code=resp.status, response_headers=dict(resp.headers), response_content=body, - error_message=msg, + error_message=f"HTTP {resp.status} ({retry_label}, will retry in {wait_time:.1f}s)", ) - except Exception as _log_e: - logging.debug("[DEBUG] response logging failed: %s", _log_e) + await sleep_with_interrupt( + wait_time, + cfg.node_cls, + cfg.wait_label if cfg.monitor_progress else None, + start_time if cfg.monitor_progress else None, + cfg.estimated_total, + display_callback=_display_time_progress if cfg.monitor_progress else None, + ) + continue + msg = _friendly_http_message(resp.status, body) + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=body, + error_message=msg, + ) raise Exception(msg) if expect_binary: @@ -753,17 +771,14 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): bytes_payload = bytes(buff) operation_succeeded = True final_elapsed_seconds = int(time.monotonic() - start_time) - try: - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, - request_url=url, - response_status_code=resp.status, - response_headers=dict(resp.headers), - response_content=bytes_payload, - ) - except Exception as _log_e: - logging.debug("[DEBUG] response logging failed: %s", _log_e) + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=bytes_payload, + ) return bytes_payload else: try: @@ -780,45 +795,39 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): extracted_price = cfg.price_extractor(payload) if cfg.price_extractor else None operation_succeeded = True final_elapsed_seconds = int(time.monotonic() - start_time) - try: - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, - request_url=url, - response_status_code=resp.status, - response_headers=dict(resp.headers), - response_content=response_content_to_log, - ) - except Exception as _log_e: - logging.debug("[DEBUG] response logging failed: %s", _log_e) + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=response_content_to_log, + ) return payload except ProcessingInterrupted: logging.debug("Polling was interrupted by user") raise except (ClientError, OSError) as e: - if attempt <= cfg.max_retries: + if (attempt - rate_limit_attempts) <= cfg.max_retries: logging.warning( "Connection error calling %s %s. Retrying in %.2fs (%d/%d): %s", method, url, delay, - attempt, + attempt - rate_limit_attempts, cfg.max_retries, str(e), ) - try: - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, - request_url=url, - request_headers=dict(payload_headers) if payload_headers else None, - request_params=dict(params) if params else None, - request_data=request_body_log, - error_message=f"{type(e).__name__}: {str(e)} (will retry)", - ) - except Exception as _log_e: - logging.debug("[DEBUG] request error logging failed: %s", _log_e) + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + request_headers=dict(payload_headers) if payload_headers else None, + request_params=dict(params) if params else None, + request_data=request_body_log, + error_message=f"{type(e).__name__}: {str(e)} (will retry)", + ) await sleep_with_interrupt( delay, cfg.node_cls, @@ -831,23 +840,6 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): continue diag = await _diagnose_connectivity() if not diag["internet_accessible"]: - try: - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, - request_url=url, - request_headers=dict(payload_headers) if payload_headers else None, - request_params=dict(params) if params else None, - request_data=request_body_log, - error_message=f"LocalNetworkError: {str(e)}", - ) - except Exception as _log_e: - logging.debug("[DEBUG] final error logging failed: %s", _log_e) - raise LocalNetworkError( - "Unable to connect to the API server due to local network issues. " - "Please check your internet connection and try again." - ) from e - try: request_logger.log_request_response( operation_id=operation_id, request_method=method, @@ -855,10 +847,21 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): request_headers=dict(payload_headers) if payload_headers else None, request_params=dict(params) if params else None, request_data=request_body_log, - error_message=f"ApiServerError: {str(e)}", + error_message=f"LocalNetworkError: {str(e)}", ) - except Exception as _log_e: - logging.debug("[DEBUG] final error logging failed: %s", _log_e) + raise LocalNetworkError( + "Unable to connect to the API server due to local network issues. " + "Please check your internet connection and try again." + ) from e + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + request_headers=dict(payload_headers) if payload_headers else None, + request_params=dict(params) if params else None, + request_data=request_body_log, + error_message=f"ApiServerError: {str(e)}", + ) raise ApiServerError( f"The API server at {default_base_url()} is currently unreachable. " f"The service may be experiencing issues." diff --git a/comfy_api_nodes/util/download_helpers.py b/comfy_api_nodes/util/download_helpers.py index 78bcf1fa1..aa588d038 100644 --- a/comfy_api_nodes/util/download_helpers.py +++ b/comfy_api_nodes/util/download_helpers.py @@ -167,27 +167,25 @@ async def download_url_to_bytesio( with contextlib.suppress(Exception): dest.seek(0) - with contextlib.suppress(Exception): - request_logger.log_request_response( - operation_id=op_id, - request_method="GET", - request_url=url, - response_status_code=resp.status, - response_headers=dict(resp.headers), - response_content=f"[streamed {written} bytes to dest]", - ) + request_logger.log_request_response( + operation_id=op_id, + request_method="GET", + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=f"[streamed {written} bytes to dest]", + ) return except asyncio.CancelledError: raise ProcessingInterrupted("Task cancelled") from None except (ClientError, OSError) as e: if attempt <= max_retries: - with contextlib.suppress(Exception): - request_logger.log_request_response( - operation_id=op_id, - request_method="GET", - request_url=url, - error_message=f"{type(e).__name__}: {str(e)} (will retry)", - ) + request_logger.log_request_response( + operation_id=op_id, + request_method="GET", + request_url=url, + error_message=f"{type(e).__name__}: {str(e)} (will retry)", + ) await sleep_with_interrupt(delay, cls, None, None, None) delay *= retry_backoff continue diff --git a/comfy_api_nodes/util/request_logger.py b/comfy_api_nodes/util/request_logger.py index e0cb4428d..fe0543d9b 100644 --- a/comfy_api_nodes/util/request_logger.py +++ b/comfy_api_nodes/util/request_logger.py @@ -8,7 +8,6 @@ from typing import Any import folder_paths -# Get the logger instance logger = logging.getLogger(__name__) @@ -91,38 +90,41 @@ def log_request_response( Filenames are sanitized and length-limited for cross-platform safety. If we still fail to write, we fall back to appending into api.log. """ - log_dir = get_log_directory() - filepath = _build_log_filepath(log_dir, operation_id, request_url) - - log_content: list[str] = [] - log_content.append(f"Timestamp: {datetime.datetime.now().isoformat()}") - log_content.append(f"Operation ID: {operation_id}") - log_content.append("-" * 30 + " REQUEST " + "-" * 30) - log_content.append(f"Method: {request_method}") - log_content.append(f"URL: {request_url}") - if request_headers: - log_content.append(f"Headers:\n{_format_data_for_logging(request_headers)}") - if request_params: - log_content.append(f"Params:\n{_format_data_for_logging(request_params)}") - if request_data is not None: - log_content.append(f"Data/Body:\n{_format_data_for_logging(request_data)}") - - log_content.append("\n" + "-" * 30 + " RESPONSE " + "-" * 30) - if response_status_code is not None: - log_content.append(f"Status Code: {response_status_code}") - if response_headers: - log_content.append(f"Headers:\n{_format_data_for_logging(response_headers)}") - if response_content is not None: - log_content.append(f"Content:\n{_format_data_for_logging(response_content)}") - if error_message: - log_content.append(f"Error:\n{error_message}") - try: - with open(filepath, "w", encoding="utf-8") as f: - f.write("\n".join(log_content)) - logger.debug("API log saved to: %s", filepath) - except Exception as e: - logger.error("Error writing API log to %s: %s", filepath, str(e)) + log_dir = get_log_directory() + filepath = _build_log_filepath(log_dir, operation_id, request_url) + + log_content: list[str] = [] + log_content.append(f"Timestamp: {datetime.datetime.now().isoformat()}") + log_content.append(f"Operation ID: {operation_id}") + log_content.append("-" * 30 + " REQUEST " + "-" * 30) + log_content.append(f"Method: {request_method}") + log_content.append(f"URL: {request_url}") + if request_headers: + log_content.append(f"Headers:\n{_format_data_for_logging(request_headers)}") + if request_params: + log_content.append(f"Params:\n{_format_data_for_logging(request_params)}") + if request_data is not None: + log_content.append(f"Data/Body:\n{_format_data_for_logging(request_data)}") + + log_content.append("\n" + "-" * 30 + " RESPONSE " + "-" * 30) + if response_status_code is not None: + log_content.append(f"Status Code: {response_status_code}") + if response_headers: + log_content.append(f"Headers:\n{_format_data_for_logging(response_headers)}") + if response_content is not None: + log_content.append(f"Content:\n{_format_data_for_logging(response_content)}") + if error_message: + log_content.append(f"Error:\n{error_message}") + + try: + with open(filepath, "w", encoding="utf-8") as f: + f.write("\n".join(log_content)) + logger.debug("API log saved to: %s", filepath) + except Exception as e: + logger.error("Error writing API log to %s: %s", filepath, str(e)) + except Exception as _log_e: + logging.debug("[DEBUG] log_request_response failed: %s", _log_e) if __name__ == '__main__': diff --git a/comfy_api_nodes/util/upload_helpers.py b/comfy_api_nodes/util/upload_helpers.py index 83d936ce1..7cc565263 100644 --- a/comfy_api_nodes/util/upload_helpers.py +++ b/comfy_api_nodes/util/upload_helpers.py @@ -255,17 +255,14 @@ async def upload_file( monitor_task = asyncio.create_task(_monitor()) sess: aiohttp.ClientSession | None = None try: - try: - request_logger.log_request_response( - operation_id=operation_id, - request_method="PUT", - request_url=upload_url, - request_headers=headers or None, - request_params=None, - request_data=f"[File data {len(data)} bytes]", - ) - except Exception as e: - logging.debug("[DEBUG] upload request logging failed: %s", e) + request_logger.log_request_response( + operation_id=operation_id, + request_method="PUT", + request_url=upload_url, + request_headers=headers or None, + request_params=None, + request_data=f"[File data {len(data)} bytes]", + ) sess = aiohttp.ClientSession(timeout=timeout) req = sess.put(upload_url, data=data, headers=headers, skip_auto_headers=skip_auto_headers) @@ -311,31 +308,27 @@ async def upload_file( delay *= retry_backoff continue raise Exception(f"Failed to upload (HTTP {resp.status}).") - try: - request_logger.log_request_response( - operation_id=operation_id, - request_method="PUT", - request_url=upload_url, - response_status_code=resp.status, - response_headers=dict(resp.headers), - response_content="File uploaded successfully.", - ) - except Exception as e: - logging.debug("[DEBUG] upload response logging failed: %s", e) + request_logger.log_request_response( + operation_id=operation_id, + request_method="PUT", + request_url=upload_url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content="File uploaded successfully.", + ) return except asyncio.CancelledError: raise ProcessingInterrupted("Task cancelled") from None except (aiohttp.ClientError, OSError) as e: if attempt <= max_retries: - with contextlib.suppress(Exception): - request_logger.log_request_response( - operation_id=operation_id, - request_method="PUT", - request_url=upload_url, - request_headers=headers or None, - request_data=f"[File data {len(data)} bytes]", - error_message=f"{type(e).__name__}: {str(e)} (will retry)", - ) + request_logger.log_request_response( + operation_id=operation_id, + request_method="PUT", + request_url=upload_url, + request_headers=headers or None, + request_data=f"[File data {len(data)} bytes]", + error_message=f"{type(e).__name__}: {str(e)} (will retry)", + ) await sleep_with_interrupt( delay, cls, From 117e2143543dd649d47345e183748a82d48d12d3 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Thu, 12 Feb 2026 16:51:50 -0800 Subject: [PATCH 17/25] ModelPatcherDynamic: force load non leaf weights (#12433) The current behaviour of the default ModelPatcher is to .to a model only if its fully loaded, which is how random non-leaf weights get loaded in non-LowVRAM conditions. The however means they never get loaded in dynamic_vram. In the dynamic_vram case, force load them to the GPU. --- comfy/model_patcher.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index f278fccac..b1d907ba4 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -679,18 +679,19 @@ class ModelPatcher: for key in list(self.pinned): self.unpin_weight(key) - def _load_list(self, prio_comfy_cast_weights=False): + def _load_list(self, prio_comfy_cast_weights=False, default_device=None): loading = [] for n, m in self.model.named_modules(): - params = [] - skip = False - for name, param in m.named_parameters(recurse=False): - params.append(name) + default = False + params = { name: param for name, param in m.named_parameters(recurse=False) } for name, param in m.named_parameters(recurse=True): if name not in params: - skip = True # skip random weights in non leaf modules + default = True # default random weights in non leaf modules break - if not skip and (hasattr(m, "comfy_cast_weights") or len(params) > 0): + if default and default_device is not None: + for param in params.values(): + param.data = param.data.to(device=default_device) + if not default and (hasattr(m, "comfy_cast_weights") or len(params) > 0): module_mem = comfy.model_management.module_size(m) module_offload_mem = module_mem if hasattr(m, "comfy_cast_weights"): @@ -1495,7 +1496,7 @@ class ModelPatcherDynamic(ModelPatcher): #with pin and unpin syncrhonization which can be expensive for small weights #with a high layer rate (e.g. autoregressive LLMs). #prioritize the non-comfy weights (note the order reverse). - loading = self._load_list(prio_comfy_cast_weights=True) + loading = self._load_list(prio_comfy_cast_weights=True, default_device=device_to) loading.sort(reverse=True) for x in loading: @@ -1579,7 +1580,7 @@ class ModelPatcherDynamic(ModelPatcher): return 0 if vbar is None else vbar.free_memory(memory_to_free) def partially_unload_ram(self, ram_to_unload): - loading = self._load_list(prio_comfy_cast_weights=True) + loading = self._load_list(prio_comfy_cast_weights=True, default_device=self.offload_device) for x in loading: _, _, _, _, m, _ = x ram_to_unload -= comfy.pinned_memory.unpin_memory(m) From ae79e33345dc893f8e0632c380c0e91dc09ac6e8 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Thu, 12 Feb 2026 16:56:42 -0800 Subject: [PATCH 18/25] llama: use a more efficient rope implementation (#12434) Get rid of the cat and unary negation and inplace add-cmul the two halves of the rope. Precompute -sin once at the start of the model rather than every transformer block. This is slightly faster on both GPU and CPU bound setups. --- comfy/text_encoders/llama.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index b6735d210..54f3d5595 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -355,13 +355,6 @@ class RMSNorm(nn.Module): -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_dims=None, device=None): if not isinstance(theta, list): theta = [theta] @@ -390,20 +383,30 @@ def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_di else: cos = cos.unsqueeze(1) sin = sin.unsqueeze(1) - out.append((cos, sin)) + sin_split = sin.shape[-1] // 2 + out.append((cos, sin[..., : sin_split], -sin[..., sin_split :])) if len(out) == 1: return out[0] return out - def apply_rope(xq, xk, freqs_cis): org_dtype = xq.dtype cos = freqs_cis[0] sin = freqs_cis[1] - q_embed = (xq * cos) + (rotate_half(xq) * sin) - k_embed = (xk * cos) + (rotate_half(xk) * sin) + nsin = freqs_cis[2] + + q_embed = (xq * cos) + q_split = q_embed.shape[-1] // 2 + q_embed[..., : q_split].addcmul_(xq[..., q_split :], nsin) + q_embed[..., q_split :].addcmul_(xq[..., : q_split], sin) + + k_embed = (xk * cos) + k_split = k_embed.shape[-1] // 2 + k_embed[..., : k_split].addcmul_(xk[..., k_split :], nsin) + k_embed[..., k_split :].addcmul_(xk[..., : k_split], sin) + return q_embed.to(org_dtype), k_embed.to(org_dtype) From e03fe8b5919a23a473cea6e53f916f7403c082a5 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 12 Feb 2026 20:29:12 -0800 Subject: [PATCH 19/25] Update command to install AMD stable linux pytorch. (#12437) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 96dc2904b..3ccdc9c19 100644 --- a/README.md +++ b/README.md @@ -227,7 +227,7 @@ Put your VAE in: models/vae AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version: -```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4``` +```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.1``` This is the command to install the nightly with ROCm 7.1 which might have some performance improvements: From 8902907d7ab949ce42dd9b658b4a4582ed9fb630 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Fri, 13 Feb 2026 12:29:37 -0800 Subject: [PATCH 20/25] dynamic_vram: Training fixes (#12442) --- comfy/model_patcher.py | 4 ++++ comfy_extras/nodes_train.py | 11 ++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index b1d907ba4..67dce088e 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1561,6 +1561,8 @@ class ModelPatcherDynamic(ModelPatcher): allocated_size += weight_size vbar.set_watermark_limit(allocated_size) + move_weight_functions(m, device_to) + logging.info(f"Model {self.model.__class__.__name__} prepared for dynamic VRAM loading. {allocated_size // (1024 ** 2)}MB Staged. {num_patches} patches attached.") self.model.device = device_to @@ -1601,6 +1603,8 @@ class ModelPatcherDynamic(ModelPatcher): if unpatch_weights: self.partially_unload_ram(1e32) self.partially_unload(None, 1e32) + for m in self.model.modules(): + move_weight_functions(m, device_to) def partially_load(self, device_to, extra_memory=0, force_patch_weights=False): assert not force_patch_weights #See above diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index 630eedc9f..aa2d88673 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -1035,7 +1035,7 @@ class TrainLoraNode(io.ComfyNode): io.Boolean.Input( "offloading", default=False, - tooltip="Depth level for gradient checkpointing.", + tooltip="Offload the Model to RAM. Requires Bypass Mode.", ), io.Combo.Input( "existing_lora", @@ -1124,6 +1124,15 @@ class TrainLoraNode(io.ComfyNode): lora_dtype = node_helpers.string_to_torch_dtype(lora_dtype) mp.set_model_compute_dtype(dtype) + if mp.is_dynamic(): + if not bypass_mode: + logging.info("Training MP is Dynamic - forcing bypass mode. Start comfy with --highvram to force weight diff mode") + bypass_mode = True + offloading = True + elif offloading: + if not bypass_mode: + logging.info("Training Offload selected - forcing bypass mode. Set bypass = True to remove this message") + # Prepare latents and compute counts latents, num_images, multi_res = _prepare_latents_and_count( latents, dtype, bucket_mode From e1add563f9e89026e8c4e8825a2b279fbd67d23a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 13 Feb 2026 12:35:13 -0800 Subject: [PATCH 21/25] Use torch RMSNorm for flux models and refactor hunyuan video code. (#12432) --- comfy/controlnet.py | 1 + comfy/ldm/chroma/layers.py | 3 +- comfy/ldm/chroma_radiance/layers.py | 8 ++--- comfy/ldm/flux/layers.py | 53 +++++++---------------------- comfy/ldm/flux/model.py | 3 +- comfy/ldm/hunyuan_video/model.py | 11 +++--- comfy/lora_convert.py | 2 +- comfy/model_detection.py | 18 +++++++--- comfy/supported_models.py | 32 +++++++++++++++-- comfy/utils.py | 12 +++---- 10 files changed, 74 insertions(+), 69 deletions(-) diff --git a/comfy/controlnet.py b/comfy/controlnet.py index 9e1e704e0..8336412f2 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -560,6 +560,7 @@ def load_controlnet_hunyuandit(controlnet_data, model_options={}): def load_controlnet_flux_xlabs_mistoline(sd, mistoline=False, model_options={}): model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options) control_model = comfy.ldm.flux.controlnet.ControlNetFlux(mistoline=mistoline, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config) + sd = model_config.process_unet_state_dict(sd) control_model = controlnet_load_state_dict(control_model, sd) extra_conds = ['y', 'guidance'] control = ControlNet(control_model, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds) diff --git a/comfy/ldm/chroma/layers.py b/comfy/ldm/chroma/layers.py index 2d5684348..df348a8ed 100644 --- a/comfy/ldm/chroma/layers.py +++ b/comfy/ldm/chroma/layers.py @@ -3,7 +3,6 @@ from torch import Tensor, nn from comfy.ldm.flux.layers import ( MLPEmbedder, - RMSNorm, ModulationOut, ) @@ -29,7 +28,7 @@ class Approximator(nn.Module): super().__init__() self.in_proj = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device) self.layers = nn.ModuleList([MLPEmbedder(hidden_dim, hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)]) - self.norms = nn.ModuleList([RMSNorm(hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)]) + self.norms = nn.ModuleList([operations.RMSNorm(hidden_dim, dtype=dtype, device=device) for x in range( n_layers)]) self.out_proj = operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device) @property diff --git a/comfy/ldm/chroma_radiance/layers.py b/comfy/ldm/chroma_radiance/layers.py index 3c7bc9b6b..08d31e0ba 100644 --- a/comfy/ldm/chroma_radiance/layers.py +++ b/comfy/ldm/chroma_radiance/layers.py @@ -4,8 +4,6 @@ from functools import lru_cache import torch from torch import nn -from comfy.ldm.flux.layers import RMSNorm - class NerfEmbedder(nn.Module): """ @@ -145,7 +143,7 @@ class NerfGLUBlock(nn.Module): # We now need to generate parameters for 3 matrices. total_params = 3 * hidden_size_x**2 * mlp_ratio self.param_generator = operations.Linear(hidden_size_s, total_params, dtype=dtype, device=device) - self.norm = RMSNorm(hidden_size_x, dtype=dtype, device=device, operations=operations) + self.norm = operations.RMSNorm(hidden_size_x, dtype=dtype, device=device) self.mlp_ratio = mlp_ratio @@ -178,7 +176,7 @@ class NerfGLUBlock(nn.Module): class NerfFinalLayer(nn.Module): def __init__(self, hidden_size, out_channels, dtype=None, device=None, operations=None): super().__init__() - self.norm = RMSNorm(hidden_size, dtype=dtype, device=device, operations=operations) + self.norm = operations.RMSNorm(hidden_size, dtype=dtype, device=device) self.linear = operations.Linear(hidden_size, out_channels, dtype=dtype, device=device) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -190,7 +188,7 @@ class NerfFinalLayer(nn.Module): class NerfFinalLayerConv(nn.Module): def __init__(self, hidden_size: int, out_channels: int, dtype=None, device=None, operations=None): super().__init__() - self.norm = RMSNorm(hidden_size, dtype=dtype, device=device, operations=operations) + self.norm = operations.RMSNorm(hidden_size, dtype=dtype, device=device) self.conv = operations.Conv2d( in_channels=hidden_size, out_channels=out_channels, diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py index 60f2bdae2..1f2975fb1 100644 --- a/comfy/ldm/flux/layers.py +++ b/comfy/ldm/flux/layers.py @@ -5,8 +5,6 @@ import torch from torch import Tensor, nn from .math import attention, rope -import comfy.ops -import comfy.ldm.common_dit class EmbedND(nn.Module): @@ -87,20 +85,12 @@ def build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=False, yak_mlp=False, dt operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), ) -class RMSNorm(torch.nn.Module): - def __init__(self, dim: int, dtype=None, device=None, operations=None): - super().__init__() - self.scale = nn.Parameter(torch.empty((dim), dtype=dtype, device=device)) - - def forward(self, x: Tensor): - return comfy.ldm.common_dit.rms_norm(x, self.scale, 1e-6) - class QKNorm(torch.nn.Module): def __init__(self, dim: int, dtype=None, device=None, operations=None): super().__init__() - self.query_norm = RMSNorm(dim, dtype=dtype, device=device, operations=operations) - self.key_norm = RMSNorm(dim, dtype=dtype, device=device, operations=operations) + self.query_norm = operations.RMSNorm(dim, dtype=dtype, device=device) + self.key_norm = operations.RMSNorm(dim, dtype=dtype, device=device) def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple: q = self.query_norm(q) @@ -169,7 +159,7 @@ class SiLUActivation(nn.Module): class DoubleStreamBlock(nn.Module): - def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, yak_mlp=False, dtype=None, device=None, operations=None): + def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, modulation=True, mlp_silu_act=False, proj_bias=True, yak_mlp=False, dtype=None, device=None, operations=None): super().__init__() mlp_hidden_dim = int(hidden_size * mlp_ratio) @@ -197,8 +187,6 @@ class DoubleStreamBlock(nn.Module): self.txt_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations) - self.flipped_img_txt = flipped_img_txt - def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}): if self.modulation: img_mod1, img_mod2 = self.img_mod(vec) @@ -224,32 +212,17 @@ class DoubleStreamBlock(nn.Module): del txt_qkv txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) - if self.flipped_img_txt: - q = torch.cat((img_q, txt_q), dim=2) - del img_q, txt_q - k = torch.cat((img_k, txt_k), dim=2) - del img_k, txt_k - v = torch.cat((img_v, txt_v), dim=2) - del img_v, txt_v - # run actual attention - attn = attention(q, k, v, - pe=pe, mask=attn_mask, transformer_options=transformer_options) - del q, k, v + q = torch.cat((txt_q, img_q), dim=2) + del txt_q, img_q + k = torch.cat((txt_k, img_k), dim=2) + del txt_k, img_k + v = torch.cat((txt_v, img_v), dim=2) + del txt_v, img_v + # run actual attention + attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options) + del q, k, v - img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1]:] - else: - q = torch.cat((txt_q, img_q), dim=2) - del txt_q, img_q - k = torch.cat((txt_k, img_k), dim=2) - del txt_k, img_k - v = torch.cat((txt_v, img_v), dim=2) - del txt_v, img_v - # run actual attention - attn = attention(q, k, v, - pe=pe, mask=attn_mask, transformer_options=transformer_options) - del q, k, v - - txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:] + txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:] # calculate the img bloks img += apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img) diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index f40c2a7a9..260ccad7e 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -16,7 +16,6 @@ from .layers import ( SingleStreamBlock, timestep_embedding, Modulation, - RMSNorm ) @dataclass @@ -81,7 +80,7 @@ class Flux(nn.Module): self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device) if params.txt_norm: - self.txt_norm = RMSNorm(params.context_in_dim, dtype=dtype, device=device, operations=operations) + self.txt_norm = operations.RMSNorm(params.context_in_dim, dtype=dtype, device=device) else: self.txt_norm = None diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py index 55ab550f8..563f28f6b 100644 --- a/comfy/ldm/hunyuan_video/model.py +++ b/comfy/ldm/hunyuan_video/model.py @@ -241,7 +241,6 @@ class HunyuanVideo(nn.Module): self.num_heads, mlp_ratio=params.mlp_ratio, qkv_bias=params.qkv_bias, - flipped_img_txt=True, dtype=dtype, device=device, operations=operations ) for _ in range(params.depth) @@ -378,14 +377,14 @@ class HunyuanVideo(nn.Module): extra_txt_ids = torch.zeros((txt_ids.shape[0], txt_vision_states.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype) txt_ids = torch.cat((txt_ids, extra_txt_ids), dim=1) - ids = torch.cat((img_ids, txt_ids), dim=1) + ids = torch.cat((txt_ids, img_ids), dim=1) pe = self.pe_embedder(ids) img_len = img.shape[1] if txt_mask is not None: attn_mask_len = img_len + txt.shape[1] attn_mask = torch.zeros((1, 1, attn_mask_len), dtype=img.dtype, device=img.device) - attn_mask[:, 0, img_len:] = txt_mask + attn_mask[:, 0, :txt.shape[1]] = txt_mask else: attn_mask = None @@ -413,7 +412,7 @@ class HunyuanVideo(nn.Module): if add is not None: img += add - img = torch.cat((img, txt), 1) + img = torch.cat((txt, img), 1) transformer_options["total_blocks"] = len(self.single_blocks) transformer_options["block_type"] = "single" @@ -435,9 +434,9 @@ class HunyuanVideo(nn.Module): if i < len(control_o): add = control_o[i] if add is not None: - img[:, : img_len] += add + img[:, txt.shape[1]: img_len + txt.shape[1]] += add - img = img[:, : img_len] + img = img[:, txt.shape[1]: img_len + txt.shape[1]] if ref_latent is not None: img = img[:, ref_latent.shape[1]:] diff --git a/comfy/lora_convert.py b/comfy/lora_convert.py index 9d8d21efe..749e81df3 100644 --- a/comfy/lora_convert.py +++ b/comfy/lora_convert.py @@ -5,7 +5,7 @@ import comfy.utils def convert_lora_bfl_control(sd): #BFL loras for Flux sd_out = {} for k in sd: - k_to = "diffusion_model.{}".format(k.replace(".lora_B.bias", ".diff_b").replace("_norm.scale", "_norm.scale.set_weight")) + k_to = "diffusion_model.{}".format(k.replace(".lora_B.bias", ".diff_b").replace("_norm.scale", "_norm.set_weight")) sd_out[k_to] = sd[k] sd_out["diffusion_model.img_in.reshape_weight"] = torch.tensor([sd["img_in.lora_B.weight"].shape[0], sd["img_in.lora_A.weight"].shape[1]]) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index e8ad725df..30ea03e8e 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -19,6 +19,12 @@ def count_blocks(state_dict_keys, prefix_string): count += 1 return count +def any_suffix_in(keys, prefix, main, suffix_list=[]): + for x in suffix_list: + if "{}{}{}".format(prefix, main, x) in keys: + return True + return False + def calculate_transformer_depth(prefix, state_dict_keys, state_dict): context_dim = None use_linear_in_transformer = False @@ -186,7 +192,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["meanflow_sum"] = False return dit_config - if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or f"{key_prefix}distilled_guidance_layer.norms.0.scale" in state_dict_keys): #Flux, Chroma or Chroma Radiance (has no img_in.weight) + if any_suffix_in(state_dict_keys, key_prefix, 'double_blocks.0.img_attn.norm.key_norm.', ["weight", "scale"]) and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or any_suffix_in(state_dict_keys, key_prefix, 'distilled_guidance_layer.norms.0.', ["weight", "scale"])): #Flux, Chroma or Chroma Radiance (has no img_in.weight) dit_config = {} if '{}double_stream_modulation_img.lin.weight'.format(key_prefix) in state_dict_keys: dit_config["image_model"] = "flux2" @@ -241,7 +247,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.') dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.') - if '{}distilled_guidance_layer.0.norms.0.scale'.format(key_prefix) in state_dict_keys or '{}distilled_guidance_layer.norms.0.scale'.format(key_prefix) in state_dict_keys: #Chroma + + if any_suffix_in(state_dict_keys, key_prefix, 'distilled_guidance_layer.0.norms.0.', ["weight", "scale"]) or any_suffix_in(state_dict_keys, key_prefix, 'distilled_guidance_layer.norms.0.', ["weight", "scale"]): #Chroma dit_config["image_model"] = "chroma" dit_config["in_channels"] = 64 dit_config["out_channels"] = 64 @@ -249,7 +256,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["out_dim"] = 3072 dit_config["hidden_dim"] = 5120 dit_config["n_layers"] = 5 - if f"{key_prefix}nerf_blocks.0.norm.scale" in state_dict_keys: #Chroma Radiance + + if any_suffix_in(state_dict_keys, key_prefix, 'nerf_blocks.0.norm.', ["weight", "scale"]): #Chroma Radiance dit_config["image_model"] = "chroma_radiance" dit_config["in_channels"] = 3 dit_config["out_channels"] = 3 @@ -259,7 +267,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["nerf_depth"] = 4 dit_config["nerf_max_freqs"] = 8 dit_config["nerf_tile_size"] = 512 - dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear" + dit_config["nerf_final_head_type"] = "conv" if any_suffix_in(state_dict_keys, key_prefix, 'nerf_final_layer_conv.norm.', ["weight", "scale"]) else "linear" dit_config["nerf_embedder_dtype"] = torch.float32 if "{}__x0__".format(key_prefix) in state_dict_keys: # x0 pred dit_config["use_x0"] = True @@ -268,7 +276,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): else: dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys dit_config["yak_mlp"] = '{}double_blocks.0.img_mlp.gate_proj.weight'.format(key_prefix) in state_dict_keys - dit_config["txt_norm"] = "{}txt_norm.scale".format(key_prefix) in state_dict_keys + dit_config["txt_norm"] = any_suffix_in(state_dict_keys, key_prefix, 'txt_norm.', ["weight", "scale"]) if dit_config["yak_mlp"] and dit_config["txt_norm"]: # Ovis model dit_config["txt_ids_dims"] = [1, 2] diff --git a/comfy/supported_models.py b/comfy/supported_models.py index d33db7507..c28be1716 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -710,6 +710,15 @@ class Flux(supported_models_base.BASE): supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32] + def process_unet_state_dict(self, state_dict): + out_sd = {} + for k in list(state_dict.keys()): + key_out = k + if key_out.endswith("_norm.scale"): + key_out = "{}.weight".format(key_out[:-len(".scale")]) + out_sd[key_out] = state_dict[k] + return out_sd + vae_key_prefix = ["vae."] text_encoder_key_prefix = ["text_encoders."] @@ -898,11 +907,13 @@ class HunyuanVideo(supported_models_base.BASE): key_out = key_out.replace("txt_in.c_embedder.linear_1.", "txt_in.c_embedder.in_layer.").replace("txt_in.c_embedder.linear_2.", "txt_in.c_embedder.out_layer.") key_out = key_out.replace("_mod.linear.", "_mod.lin.").replace("_attn_qkv.", "_attn.qkv.") key_out = key_out.replace("mlp.fc1.", "mlp.0.").replace("mlp.fc2.", "mlp.2.") - key_out = key_out.replace("_attn_q_norm.weight", "_attn.norm.query_norm.scale").replace("_attn_k_norm.weight", "_attn.norm.key_norm.scale") - key_out = key_out.replace(".q_norm.weight", ".norm.query_norm.scale").replace(".k_norm.weight", ".norm.key_norm.scale") + key_out = key_out.replace("_attn_q_norm.weight", "_attn.norm.query_norm.weight").replace("_attn_k_norm.weight", "_attn.norm.key_norm.weight") + key_out = key_out.replace(".q_norm.weight", ".norm.query_norm.weight").replace(".k_norm.weight", ".norm.key_norm.weight") key_out = key_out.replace("_attn_proj.", "_attn.proj.") key_out = key_out.replace(".modulation.linear.", ".modulation.lin.") key_out = key_out.replace("_in.mlp.2.", "_in.out_layer.").replace("_in.mlp.0.", "_in.in_layer.") + if key_out.endswith(".scale"): + key_out = "{}.weight".format(key_out[:-len(".scale")]) out_sd[key_out] = state_dict[k] return out_sd @@ -1264,6 +1275,15 @@ class Hunyuan3Dv2(supported_models_base.BASE): latent_format = latent_formats.Hunyuan3Dv2 + def process_unet_state_dict(self, state_dict): + out_sd = {} + for k in list(state_dict.keys()): + key_out = k + if key_out.endswith(".scale"): + key_out = "{}.weight".format(key_out[:-len(".scale")]) + out_sd[key_out] = state_dict[k] + return out_sd + def process_unet_state_dict_for_saving(self, state_dict): replace_prefix = {"": "model."} return utils.state_dict_prefix_replace(state_dict, replace_prefix) @@ -1341,6 +1361,14 @@ class Chroma(supported_models_base.BASE): supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32] + def process_unet_state_dict(self, state_dict): + out_sd = {} + for k in list(state_dict.keys()): + key_out = k + if key_out.endswith(".scale"): + key_out = "{}.weight".format(key_out[:-len(".scale")]) + out_sd[key_out] = state_dict[k] + return out_sd def get_model(self, state_dict, prefix="", device=None): out = model_base.Chroma(self, device=device) diff --git a/comfy/utils.py b/comfy/utils.py index e0a94e2e1..d553a7c1b 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -675,10 +675,10 @@ def flux_to_diffusers(mmdit_config, output_prefix=""): "ff_context.linear_in.bias": "txt_mlp.0.bias", "ff_context.linear_out.weight": "txt_mlp.2.weight", "ff_context.linear_out.bias": "txt_mlp.2.bias", - "attn.norm_q.weight": "img_attn.norm.query_norm.scale", - "attn.norm_k.weight": "img_attn.norm.key_norm.scale", - "attn.norm_added_q.weight": "txt_attn.norm.query_norm.scale", - "attn.norm_added_k.weight": "txt_attn.norm.key_norm.scale", + "attn.norm_q.weight": "img_attn.norm.query_norm.weight", + "attn.norm_k.weight": "img_attn.norm.key_norm.weight", + "attn.norm_added_q.weight": "txt_attn.norm.query_norm.weight", + "attn.norm_added_k.weight": "txt_attn.norm.key_norm.weight", } for k in block_map: @@ -701,8 +701,8 @@ def flux_to_diffusers(mmdit_config, output_prefix=""): "norm.linear.bias": "modulation.lin.bias", "proj_out.weight": "linear2.weight", "proj_out.bias": "linear2.bias", - "attn.norm_q.weight": "norm.query_norm.scale", - "attn.norm_k.weight": "norm.key_norm.scale", + "attn.norm_q.weight": "norm.query_norm.weight", + "attn.norm_k.weight": "norm.key_norm.weight", "attn.to_qkv_mlp_proj.weight": "linear1.weight", # Flux 2 "attn.to_out.weight": "linear2.weight", # Flux 2 } From 831351a29e91ea758437227c2f3c915a6be6d1a6 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 13 Feb 2026 17:15:23 -0800 Subject: [PATCH 22/25] Support generating attention masks for left padded text encoders. (#12454) --- comfy/sd1_clip.py | 15 +++++++++++---- comfy/text_encoders/ace15.py | 8 +------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 4c817d468..b564d1529 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -171,8 +171,9 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): def process_tokens(self, tokens, device): end_token = self.special_tokens.get("end", None) + pad_token = self.special_tokens.get("pad", -1) if end_token is None: - cmp_token = self.special_tokens.get("pad", -1) + cmp_token = pad_token else: cmp_token = end_token @@ -186,15 +187,21 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): other_embeds = [] eos = False index = 0 + left_pad = False for y in x: if isinstance(y, numbers.Integral): - if eos: + token = int(y) + if index == 0 and token == pad_token: + left_pad = True + + if eos or (left_pad and token == pad_token): attention_mask.append(0) else: attention_mask.append(1) - token = int(y) + left_pad = False + tokens_temp += [token] - if not eos and token == cmp_token: + if not eos and token == cmp_token and not left_pad: if end_token is None: attention_mask[-1] = 0 eos = True diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py index 0fdd4669f..f135d74c1 100644 --- a/comfy/text_encoders/ace15.py +++ b/comfy/text_encoders/ace15.py @@ -10,7 +10,6 @@ import comfy.utils def sample_manual_loop_no_classes( model, ids=None, - paddings=[], execution_dtype=None, cfg_scale: float = 2.0, temperature: float = 0.85, @@ -36,9 +35,6 @@ def sample_manual_loop_no_classes( embeds, attention_mask, num_tokens, embeds_info = model.process_tokens(ids, device) embeds_batch = embeds.shape[0] - for i, t in enumerate(paddings): - attention_mask[i, :t] = 0 - attention_mask[i, t:] = 1 output_audio_codes = [] past_key_values = [] @@ -135,13 +131,11 @@ def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=102 pos_pad = (len(negative) - len(positive)) positive = [model.special_tokens["pad"]] * pos_pad + positive - paddings = [pos_pad, neg_pad] ids = [positive, negative] else: - paddings = [] ids = [positive] - return sample_manual_loop_no_classes(model, ids, paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens) + return sample_manual_loop_no_classes(model, ids, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens) class ACE15Tokenizer(sd1_clip.SD1Tokenizer): From 726af73867c18c5ca8b980a2c28401d77e5b365a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 13 Feb 2026 17:21:10 -0800 Subject: [PATCH 23/25] Fix some custom nodes. (#12455) --- comfy/ldm/flux/layers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py index 1f2975fb1..3518a1922 100644 --- a/comfy/ldm/flux/layers.py +++ b/comfy/ldm/flux/layers.py @@ -6,6 +6,8 @@ from torch import Tensor, nn from .math import attention, rope +# Fix import for some custom nodes, TODO: delete eventually. +RMSNorm = None class EmbedND(nn.Module): def __init__(self, dim: int, theta: int, axes_dim: list): From 712efb466b9379e6761802c44027783d37d96a87 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 13 Feb 2026 18:56:54 -0800 Subject: [PATCH 24/25] Add left padding to LTXAV text encoder. (#12456) --- comfy/text_encoders/lt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/comfy/text_encoders/lt.py b/comfy/text_encoders/lt.py index 3f87dfd6a..9cf87c0b2 100644 --- a/comfy/text_encoders/lt.py +++ b/comfy/text_encoders/lt.py @@ -25,7 +25,7 @@ def ltxv_te(*args, **kwargs): class Gemma3_12BTokenizer(sd1_clip.SDTokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): tokenizer = tokenizer_data.get("spiece_model", None) - super().__init__(tokenizer, pad_with_end=False, embedding_size=3840, embedding_key='gemma3_12b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, disable_weights=True, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data) + super().__init__(tokenizer, pad_with_end=False, embedding_size=3840, embedding_key='gemma3_12b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, pad_left=True, disable_weights=True, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data) def state_dict(self): return {"spiece_model": self.tokenizer.serialize_model()} @@ -97,6 +97,7 @@ class LTXAVTEModel(torch.nn.Module): token_weight_pairs = token_weight_pairs["gemma3_12b"] out, pooled, extra = self.gemma3_12b.encode_token_weights(token_weight_pairs) + out = out[:, :, -torch.sum(extra["attention_mask"]).item():] out_device = out.device if comfy.model_management.should_use_bf16(self.execution_device): out = out.to(device=self.execution_device, dtype=torch.bfloat16) @@ -138,6 +139,7 @@ class LTXAVTEModel(torch.nn.Module): token_weight_pairs = token_weight_pairs.get("gemma3_12b", []) num_tokens = sum(map(lambda a: len(a), token_weight_pairs)) + num_tokens = max(num_tokens, 64) return num_tokens * constant * 1024 * 1024 def ltxav_te(dtype_llama=None, llama_quantization_metadata=None): From dc9822b7df4785e77690e93b0e09feaff01e2e12 Mon Sep 17 00:00:00 2001 From: krigeta <75309361+krigeta@users.noreply.github.com> Date: Sat, 14 Feb 2026 08:53:52 +0530 Subject: [PATCH 25/25] Add working Qwen 2512 ControlNet (Fun ControlNet) support (#12359) --- comfy/controlnet.py | 73 +++++++++++ comfy/ldm/qwen_image/controlnet.py | 190 +++++++++++++++++++++++++++++ 2 files changed, 263 insertions(+) diff --git a/comfy/controlnet.py b/comfy/controlnet.py index 8336412f2..ba670b16d 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -297,6 +297,30 @@ class ControlNet(ControlBase): self.model_sampling_current = None super().cleanup() + +class QwenFunControlNet(ControlNet): + def get_control(self, x_noisy, t, cond, batched_number, transformer_options): + # Fun checkpoints are more sensitive to high strengths in the generic + # ControlNet merge path. Use a soft response curve so strength=1.0 stays + # unchanged while >1 grows more gently. + original_strength = self.strength + self.strength = math.sqrt(max(self.strength, 0.0)) + try: + return super().get_control(x_noisy, t, cond, batched_number, transformer_options) + finally: + self.strength = original_strength + + def pre_run(self, model, percent_to_timestep_function): + super().pre_run(model, percent_to_timestep_function) + self.set_extra_arg("base_model", model.diffusion_model) + + def copy(self): + c = QwenFunControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype) + c.control_model = self.control_model + c.control_model_wrapped = self.control_model_wrapped + self.copy_to(c) + return c + class ControlLoraOps: class Linear(torch.nn.Module, comfy.ops.CastWeightBiasOp): def __init__(self, in_features: int, out_features: int, bias: bool = True, @@ -606,6 +630,53 @@ def load_controlnet_qwen_instantx(sd, model_options={}): control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds) return control + +def load_controlnet_qwen_fun(sd, model_options={}): + load_device = comfy.model_management.get_torch_device() + weight_dtype = comfy.utils.weight_dtype(sd) + unet_dtype = model_options.get("dtype", weight_dtype) + manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device) + + operations = model_options.get("custom_operations", None) + if operations is None: + operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype, disable_fast_fp8=True) + + in_features = sd["control_img_in.weight"].shape[1] + inner_dim = sd["control_img_in.weight"].shape[0] + + block_weight = sd["control_blocks.0.attn.to_q.weight"] + attention_head_dim = sd["control_blocks.0.attn.norm_q.weight"].shape[0] + num_attention_heads = max(1, block_weight.shape[0] // max(1, attention_head_dim)) + + model = comfy.ldm.qwen_image.controlnet.QwenImageFunControlNetModel( + control_in_features=in_features, + inner_dim=inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + num_control_blocks=5, + main_model_double=60, + injection_layers=(0, 12, 24, 36, 48), + operations=operations, + device=comfy.model_management.unet_offload_device(), + dtype=unet_dtype, + ) + model = controlnet_load_state_dict(model, sd) + + latent_format = comfy.latent_formats.Wan21() + control = QwenFunControlNet( + model, + compression_ratio=1, + latent_format=latent_format, + # Fun checkpoints already expect their own 33-channel context handling. + # Enabling generic concat_mask injects an extra mask channel at apply-time + # and breaks the intended fallback packing path. + concat_mask=False, + load_device=load_device, + manual_cast_dtype=manual_cast_dtype, + extra_conds=[], + ) + return control + def convert_mistoline(sd): return comfy.utils.state_dict_prefix_replace(sd, {"single_controlnet_blocks.": "controlnet_single_blocks."}) @@ -683,6 +754,8 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}): return load_controlnet_qwen_instantx(controlnet_data, model_options=model_options) elif "controlnet_x_embedder.weight" in controlnet_data: return load_controlnet_flux_instantx(controlnet_data, model_options=model_options) + elif "control_blocks.0.after_proj.weight" in controlnet_data and "control_img_in.weight" in controlnet_data: + return load_controlnet_qwen_fun(controlnet_data, model_options=model_options) elif "controlnet_blocks.0.linear.weight" in controlnet_data: #mistoline flux return load_controlnet_flux_xlabs_mistoline(convert_mistoline(controlnet_data), mistoline=True, model_options=model_options) diff --git a/comfy/ldm/qwen_image/controlnet.py b/comfy/ldm/qwen_image/controlnet.py index a6d408104..c0aae9240 100644 --- a/comfy/ldm/qwen_image/controlnet.py +++ b/comfy/ldm/qwen_image/controlnet.py @@ -2,6 +2,196 @@ import torch import math from .model import QwenImageTransformer2DModel +from .model import QwenImageTransformerBlock + + +class QwenImageFunControlBlock(QwenImageTransformerBlock): + def __init__(self, dim, num_attention_heads, attention_head_dim, has_before_proj=False, dtype=None, device=None, operations=None): + super().__init__( + dim=dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + dtype=dtype, + device=device, + operations=operations, + ) + self.has_before_proj = has_before_proj + if has_before_proj: + self.before_proj = operations.Linear(dim, dim, device=device, dtype=dtype) + self.after_proj = operations.Linear(dim, dim, device=device, dtype=dtype) + + +class QwenImageFunControlNetModel(torch.nn.Module): + def __init__( + self, + control_in_features=132, + inner_dim=3072, + num_attention_heads=24, + attention_head_dim=128, + num_control_blocks=5, + main_model_double=60, + injection_layers=(0, 12, 24, 36, 48), + dtype=None, + device=None, + operations=None, + ): + super().__init__() + self.dtype = dtype + self.main_model_double = main_model_double + self.injection_layers = tuple(injection_layers) + # Keep base hint scaling at 1.0 so user-facing strength behaves similarly + # to the reference Gen2/VideoX implementation around strength=1. + self.hint_scale = 1.0 + self.control_img_in = operations.Linear(control_in_features, inner_dim, device=device, dtype=dtype) + + self.control_blocks = torch.nn.ModuleList([]) + for i in range(num_control_blocks): + self.control_blocks.append( + QwenImageFunControlBlock( + dim=inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + has_before_proj=(i == 0), + dtype=dtype, + device=device, + operations=operations, + ) + ) + + def _process_hint_tokens(self, hint): + if hint is None: + return None + if hint.ndim == 4: + hint = hint.unsqueeze(2) + + # Fun checkpoints are trained with 33 latent channels before 2x2 packing: + # [control_latent(16), mask(1), inpaint_latent(16)] -> 132 features. + # Default behavior (no inpaint input in stock Apply ControlNet) should use + # zeros for mask/inpaint branches, matching VideoX fallback semantics. + expected_c = self.control_img_in.weight.shape[1] // 4 + if hint.shape[1] == 16 and expected_c == 33: + zeros_mask = torch.zeros_like(hint[:, :1]) + zeros_inpaint = torch.zeros_like(hint) + hint = torch.cat([hint, zeros_mask, zeros_inpaint], dim=1) + + bs, c, t, h, w = hint.shape + hidden_states = torch.nn.functional.pad(hint, (0, w % 2, 0, h % 2)) + orig_shape = hidden_states.shape + hidden_states = hidden_states.view( + orig_shape[0], + orig_shape[1], + orig_shape[-3], + orig_shape[-2] // 2, + 2, + orig_shape[-1] // 2, + 2, + ) + hidden_states = hidden_states.permute(0, 2, 3, 5, 1, 4, 6) + hidden_states = hidden_states.reshape( + bs, + t * ((h + 1) // 2) * ((w + 1) // 2), + c * 4, + ) + + expected_in = self.control_img_in.weight.shape[1] + cur_in = hidden_states.shape[-1] + if cur_in < expected_in: + pad = torch.zeros( + (hidden_states.shape[0], hidden_states.shape[1], expected_in - cur_in), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + hidden_states = torch.cat([hidden_states, pad], dim=-1) + elif cur_in > expected_in: + hidden_states = hidden_states[:, :, :expected_in] + + return hidden_states + + def forward( + self, + x, + timesteps, + context, + attention_mask=None, + guidance: torch.Tensor = None, + hint=None, + transformer_options={}, + base_model=None, + **kwargs, + ): + if base_model is None: + raise RuntimeError("Qwen Fun ControlNet requires a QwenImage base model at runtime.") + + encoder_hidden_states_mask = attention_mask + # Keep attention mask disabled inside Fun control blocks to mirror + # VideoX behavior (they rely on seq lengths for RoPE, not masked attention). + encoder_hidden_states_mask = None + + hidden_states, img_ids, _ = base_model.process_img(x) + hint_tokens = self._process_hint_tokens(hint) + if hint_tokens is None: + raise RuntimeError("Qwen Fun ControlNet requires a control hint image.") + + if hint_tokens.shape[1] != hidden_states.shape[1]: + max_tokens = min(hint_tokens.shape[1], hidden_states.shape[1]) + hint_tokens = hint_tokens[:, :max_tokens] + hidden_states = hidden_states[:, :max_tokens] + img_ids = img_ids[:, :max_tokens] + + txt_start = round( + max( + ((x.shape[-1] + (base_model.patch_size // 2)) // base_model.patch_size) // 2, + ((x.shape[-2] + (base_model.patch_size // 2)) // base_model.patch_size) // 2, + ) + ) + txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) + ids = torch.cat((txt_ids, img_ids), dim=1) + image_rotary_emb = base_model.pe_embedder(ids).to(x.dtype).contiguous() + + hidden_states = base_model.img_in(hidden_states) + encoder_hidden_states = base_model.txt_norm(context) + encoder_hidden_states = base_model.txt_in(encoder_hidden_states) + + if guidance is not None: + guidance = guidance * 1000 + + temb = ( + base_model.time_text_embed(timesteps, hidden_states) + if guidance is None + else base_model.time_text_embed(timesteps, guidance, hidden_states) + ) + + c = self.control_img_in(hint_tokens) + + for i, block in enumerate(self.control_blocks): + if i == 0: + c_in = block.before_proj(c) + hidden_states + all_c = [] + else: + all_c = list(torch.unbind(c, dim=0)) + c_in = all_c.pop(-1) + + encoder_hidden_states, c_out = block( + hidden_states=c_in, + encoder_hidden_states=encoder_hidden_states, + encoder_hidden_states_mask=encoder_hidden_states_mask, + temb=temb, + image_rotary_emb=image_rotary_emb, + transformer_options=transformer_options, + ) + + c_skip = block.after_proj(c_out) * self.hint_scale + all_c += [c_skip, c_out] + c = torch.stack(all_c, dim=0) + + hints = torch.unbind(c, dim=0)[:-1] + + controlnet_block_samples = [None] * self.main_model_double + for local_idx, base_idx in enumerate(self.injection_layers): + if local_idx < len(hints) and base_idx < len(controlnet_block_samples): + controlnet_block_samples[base_idx] = hints[local_idx] + + return {"input": controlnet_block_samples} class QwenImageControlNetModel(QwenImageTransformer2DModel):