From 1b96fae1d4a8425c44d4d3bd60acd818d05bf4f6 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 19 Jul 2025 01:55:23 -0700 Subject: [PATCH 01/24] Add nested style of dual cfg to DualCFGGuider node. (#8965) --- comfy_extras/nodes_custom_sampler.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index 33bc41842..d17737e1a 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -683,9 +683,10 @@ class CFGGuider: return (guider,) class Guider_DualCFG(comfy.samplers.CFGGuider): - def set_cfg(self, cfg1, cfg2): + def set_cfg(self, cfg1, cfg2, nested=False): self.cfg1 = cfg1 self.cfg2 = cfg2 + self.nested = nested def set_conds(self, positive, middle, negative): middle = node_helpers.conditioning_set_values(middle, {"prompt_type": "negative"}) @@ -695,14 +696,20 @@ class Guider_DualCFG(comfy.samplers.CFGGuider): negative_cond = self.conds.get("negative", None) middle_cond = self.conds.get("middle", None) positive_cond = self.conds.get("positive", None) - if model_options.get("disable_cfg1_optimization", False) == False: - if math.isclose(self.cfg2, 1.0): - negative_cond = None - if math.isclose(self.cfg1, 1.0): - middle_cond = None - out = comfy.samplers.calc_cond_batch(self.inner_model, [negative_cond, middle_cond, positive_cond], x, timestep, model_options) - return comfy.samplers.cfg_function(self.inner_model, out[1], out[0], self.cfg2, x, timestep, model_options=model_options, cond=middle_cond, uncond=negative_cond) + (out[2] - out[1]) * self.cfg1 + if self.nested: + out = comfy.samplers.calc_cond_batch(self.inner_model, [negative_cond, middle_cond, positive_cond], x, timestep, model_options) + pred_text = comfy.samplers.cfg_function(self.inner_model, out[2], out[1], self.cfg1, x, timestep, model_options=model_options, cond=positive_cond, uncond=middle_cond) + return out[0] + self.cfg2 * (pred_text - out[0]) + else: + if model_options.get("disable_cfg1_optimization", False) == False: + if math.isclose(self.cfg2, 1.0): + negative_cond = None + if math.isclose(self.cfg1, 1.0): + middle_cond = None + + out = comfy.samplers.calc_cond_batch(self.inner_model, [negative_cond, middle_cond, positive_cond], x, timestep, model_options) + return comfy.samplers.cfg_function(self.inner_model, out[1], out[0], self.cfg2, x, timestep, model_options=model_options, cond=middle_cond, uncond=negative_cond) + (out[2] - out[1]) * self.cfg1 class DualCFGGuider: @classmethod @@ -714,6 +721,7 @@ class DualCFGGuider: "negative": ("CONDITIONING", ), "cfg_conds": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}), "cfg_cond2_negative": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}), + "style": (["regular", "nested"],), } } @@ -722,10 +730,10 @@ class DualCFGGuider: FUNCTION = "get_guider" CATEGORY = "sampling/custom_sampling/guiders" - def get_guider(self, model, cond1, cond2, negative, cfg_conds, cfg_cond2_negative): + def get_guider(self, model, cond1, cond2, negative, cfg_conds, cfg_cond2_negative, style): guider = Guider_DualCFG(model) guider.set_conds(cond1, cond2, negative) - guider.set_cfg(cfg_conds, cfg_cond2_negative) + guider.set_cfg(cfg_conds, cfg_cond2_negative, nested=(style == "nested")) return (guider,) class DisableNoise: From 1da5639e865a50f921d870a92c7c87110ce20c48 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sat, 
19 Jul 2025 18:08:00 +0800 Subject: [PATCH 02/24] Update template to 0.1.37 (#8967) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7705918a8..a7e44095f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.36 +comfyui-workflow-templates==0.1.37 comfyui-embedded-docs==0.2.4 torch torchsde From 100c2478eaba71ab735539fdc00c9d0de49bc224 Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Sun, 20 Jul 2025 11:09:11 +0800 Subject: [PATCH 03/24] Add SamplingPercentToSigma node (#8963) It's helpful to adjust start_percent or end_percent based on the corresponding sigma. --- comfy_extras/nodes_custom_sampler.py | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index d17737e1a..d011f433b 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -301,6 +301,35 @@ class ExtendIntermediateSigmas: return (extended_sigmas,) + +class SamplingPercentToSigma: + @classmethod + def INPUT_TYPES(cls) -> InputTypeDict: + return { + "required": { + "model": (IO.MODEL, {}), + "sampling_percent": (IO.FLOAT, {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.0001}), + "return_actual_sigma": (IO.BOOLEAN, {"default": False, "tooltip": "Return the actual sigma value instead of the value used for interval checks.\nThis only affects results at 0.0 and 1.0."}), + } + } + + RETURN_TYPES = (IO.FLOAT,) + RETURN_NAMES = ("sigma_value",) + CATEGORY = "sampling/custom_sampling/sigmas" + + FUNCTION = "get_sigma" + + def get_sigma(self, model, sampling_percent, return_actual_sigma): + model_sampling = model.get_model_object("model_sampling") + sigma_val = model_sampling.percent_to_sigma(sampling_percent) + if return_actual_sigma: + if sampling_percent == 0.0: + sigma_val = model_sampling.sigma_max.item() + elif sampling_percent == 1.0: + sigma_val = model_sampling.sigma_min.item() + return (sigma_val,) + + class KSamplerSelect: @classmethod def INPUT_TYPES(s): @@ -887,6 +916,7 @@ NODE_CLASS_MAPPINGS = { "FlipSigmas": FlipSigmas, "SetFirstSigma": SetFirstSigma, "ExtendIntermediateSigmas": ExtendIntermediateSigmas, + "SamplingPercentToSigma": SamplingPercentToSigma, "CFGGuider": CFGGuider, "DualCFGGuider": DualCFGGuider, From a0c0785635a9f4d2da64b58fef063825f386d8da Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 19 Jul 2025 22:24:09 -0700 Subject: [PATCH 04/24] Document what the fast_fp16_accumulation is in the portable. 
(#8973) --- .ci/windows_base_files/README_VERY_IMPORTANT.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.ci/windows_base_files/README_VERY_IMPORTANT.txt b/.ci/windows_base_files/README_VERY_IMPORTANT.txt index d46acbcbf..8ab70c890 100755 --- a/.ci/windows_base_files/README_VERY_IMPORTANT.txt +++ b/.ci/windows_base_files/README_VERY_IMPORTANT.txt @@ -4,6 +4,9 @@ if you have a NVIDIA gpu: run_nvidia_gpu.bat +if you want to enable the fast fp16 accumulation (faster for fp16 models with slightly less quality): + +run_nvidia_gpu_fast_fp16_accumulation.bat To run it in slow CPU mode: From 7d627f764c2137d816a39adbc358cb28c1718a47 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Mon, 21 Jul 2025 03:58:35 +0800 Subject: [PATCH 05/24] Update template to 0.1.39 (#8981) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a7e44095f..8f6a6d112 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.37 +comfyui-workflow-templates==0.1.39 comfyui-embedded-docs==0.2.4 torch torchsde From 9a470e073e2742d4edd6e7ea1ce28d861a77d9c4 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 21 Jul 2025 14:05:43 -0400 Subject: [PATCH 06/24] ComfyUI version 0.3.45 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 7981fbaca..180ecaf8a 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.44" +__version__ = "0.3.45" diff --git a/pyproject.toml b/pyproject.toml index 96ead2157..b1d6d9df6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.44" +version = "0.3.45" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 54a45b996729b361ea12f473de760e481dcf1f0a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 21 Jul 2025 11:19:14 -0700 Subject: [PATCH 07/24] Replace torchaudio.load with pyav. 
(#8989) --- comfy_extras/nodes_audio.py | 58 ++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 8cd647846..38697240e 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -278,6 +278,62 @@ class PreviewAudio(SaveAudio): "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, } +def f32_pcm(wav: torch.Tensor) -> torch.Tensor: + """Convert audio to float 32 bits PCM format.""" + if wav.dtype.is_floating_point: + return wav + elif wav.dtype == torch.int16: + return wav.float() / (2 ** 15) + elif wav.dtype == torch.int32: + return wav.float() / (2 ** 31) + raise ValueError(f"Unsupported wav dtype: {wav.dtype}") + +def load(filepath: str, frame_offset: int = 0, num_frames: int = -1) -> tuple[torch.Tensor, int]: + with av.open(filepath) as af: + if not af.streams.audio: + raise ValueError("No audio stream found in the file.") + + stream = af.streams.audio[0] + sr = stream.codec_context.sample_rate + n_channels = stream.channels + + seek_time = frame_offset / sr if frame_offset > 0 else 0.0 + duration = num_frames / sr if num_frames > 0 else -1.0 + + sample_offset = int(sr * seek_time) + num_samples = int(sr * duration) if duration >= 0 else -1 + + # Small negative offset for MP3 artifacts, NOTE: this is LLM code so idk if it's actually necessary' + seek_sec = max(0, seek_time - 0.1) if filepath.lower().endswith('.mp3') else seek_time + af.seek(int(seek_sec / stream.time_base), stream=stream) + + frames = [] + length = 0 + for frame in af.decode(streams=stream.index): + current_offset = int(frame.rate * frame.pts * frame.time_base) + strip = max(0, sample_offset - current_offset) + + buf = torch.from_numpy(frame.to_ndarray()) + if buf.shape[0] != n_channels: + buf = buf.view(-1, n_channels).t() + + buf = buf[:, strip:] + frames.append(buf) + length += buf.shape[1] + + if num_samples > 0 and length >= num_samples: + break + + if not frames: + raise ValueError("No audio frames decoded.") + + wav = torch.cat(frames, dim=1) + if num_samples > 0: + wav = wav[:, :num_samples] + + wav = f32_pcm(wav) + return wav, sr + class LoadAudio: @classmethod def INPUT_TYPES(s): @@ -292,7 +348,7 @@ class LoadAudio: def load(self, audio): audio_path = folder_paths.get_annotated_filepath(audio) - waveform, sample_rate = torchaudio.load(audio_path) + waveform, sample_rate = load(audio_path) audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate} return (audio, ) From 5249e45a1c7d91656ebefdebe3815005ec3d39d7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 21 Jul 2025 12:23:41 -0700 Subject: [PATCH 08/24] Add hidream e1.1 example to readme. 
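Note on PATCH 07/24 above (PyAV-based audio loading): the new load() keeps the (waveform, sample_rate) return contract that torchaudio.load provided, yielding a floating-point (channels, samples) tensor, so LoadAudio only needs the one-line call-site change shown in the diff. A minimal usage sketch, assuming the `av` import is available in nodes_audio.py; the file path is a placeholder:

    waveform, sample_rate = load("input/example.flac")  # placeholder path; any format PyAV can decode
    audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate}  # add the batch dim, as LoadAudio does
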
(#8990) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0e021a687..d004364ee 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - Image Editing Models - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model) + - [HiDream E1.1](https://comfyanonymous.github.io/ComfyUI_examples/hidream/#hidream-e11) - Video Models - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/) - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/) From 0aa1c58b04b27311c6ba38b1d9949e7e20037d00 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 21 Jul 2025 13:48:25 -0700 Subject: [PATCH 09/24] This is not needed. (#8991) --- comfy_extras/nodes_audio.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 38697240e..a90b31779 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -288,7 +288,7 @@ def f32_pcm(wav: torch.Tensor) -> torch.Tensor: return wav.float() / (2 ** 31) raise ValueError(f"Unsupported wav dtype: {wav.dtype}") -def load(filepath: str, frame_offset: int = 0, num_frames: int = -1) -> tuple[torch.Tensor, int]: +def load(filepath: str) -> tuple[torch.Tensor, int]: with av.open(filepath) as af: if not af.streams.audio: raise ValueError("No audio stream found in the file.") @@ -297,40 +297,20 @@ def load(filepath: str, frame_offset: int = 0, num_frames: int = -1) -> tuple[to sr = stream.codec_context.sample_rate n_channels = stream.channels - seek_time = frame_offset / sr if frame_offset > 0 else 0.0 - duration = num_frames / sr if num_frames > 0 else -1.0 - - sample_offset = int(sr * seek_time) - num_samples = int(sr * duration) if duration >= 0 else -1 - - # Small negative offset for MP3 artifacts, NOTE: this is LLM code so idk if it's actually necessary' - seek_sec = max(0, seek_time - 0.1) if filepath.lower().endswith('.mp3') else seek_time - af.seek(int(seek_sec / stream.time_base), stream=stream) - frames = [] length = 0 for frame in af.decode(streams=stream.index): - current_offset = int(frame.rate * frame.pts * frame.time_base) - strip = max(0, sample_offset - current_offset) - buf = torch.from_numpy(frame.to_ndarray()) if buf.shape[0] != n_channels: buf = buf.view(-1, n_channels).t() - buf = buf[:, strip:] frames.append(buf) length += buf.shape[1] - if num_samples > 0 and length >= num_samples: - break - if not frames: raise ValueError("No audio frames decoded.") wav = torch.cat(frames, dim=1) - if num_samples > 0: - wav = wav[:, :num_samples] - wav = f32_pcm(wav) return wav, sr From 5ac9ec214ba3ef1632701416f27948a57ec60919 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 22 Jul 2025 01:07:51 -0700 Subject: [PATCH 10/24] Try to fix line endings workflow. 
(#9001) --- .github/workflows/check-line-endings.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/check-line-endings.yml b/.github/workflows/check-line-endings.yml index f20dca565..03b3e3ced 100644 --- a/.github/workflows/check-line-endings.yml +++ b/.github/workflows/check-line-endings.yml @@ -17,6 +17,7 @@ jobs: - name: Check for Windows line endings (CRLF) run: | # Get the list of changed files in the PR + git merge origin/${{ github.base_ref }} --no-edit CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }}..HEAD) # Flag to track if CRLF is found From 255f1398638b265a47d0e74fb4759fe6cfc3b3d4 Mon Sep 17 00:00:00 2001 From: Simon Lui <502929+simonlui@users.noreply.github.com> Date: Tue, 22 Jul 2025 12:20:09 -0700 Subject: [PATCH 11/24] Add xpu version for async offload and some other things. (#9004) --- comfy/model_management.py | 41 +++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 816caf18f..ab1e9bf3a 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -101,7 +101,7 @@ if args.directml is not None: lowvram_available = False #TODO: need to find a way to get free memory in directml before this can be enabled by default. try: - import intel_extension_for_pytorch as ipex + import intel_extension_for_pytorch as ipex # noqa: F401 _ = torch.xpu.device_count() xpu_available = xpu_available or torch.xpu.is_available() except: @@ -186,8 +186,12 @@ def get_total_memory(dev=None, torch_total_too=False): elif is_intel_xpu(): stats = torch.xpu.memory_stats(dev) mem_reserved = stats['reserved_bytes.all.current'] + if torch_version_numeric < (2, 6): + mem_total_xpu = torch.xpu.get_device_properties(dev).total_memory + else: + _, mem_total_xpu = torch.xpu.mem_get_info(dev) mem_total_torch = mem_reserved - mem_total = torch.xpu.get_device_properties(dev).total_memory + mem_total = mem_total_xpu elif is_ascend_npu(): stats = torch.npu.memory_stats(dev) mem_reserved = stats['reserved_bytes.all.current'] @@ -929,7 +933,7 @@ def device_supports_non_blocking(device): if is_device_mps(device): return False #pytorch bug? 
mps doesn't support non blocking if is_intel_xpu(): - return False + return True if args.deterministic: #TODO: figure out why deterministic breaks non blocking from gpu to cpu (previews) return False if directml_enabled: @@ -968,6 +972,8 @@ def get_offload_stream(device): stream_counter = (stream_counter + 1) % len(ss) if is_device_cuda(device): ss[stream_counter].wait_stream(torch.cuda.current_stream()) + elif is_device_xpu(device): + ss[stream_counter].wait_stream(torch.xpu.current_stream()) stream_counters[device] = stream_counter return s elif is_device_cuda(device): @@ -979,6 +985,15 @@ def get_offload_stream(device): stream_counter = (stream_counter + 1) % len(ss) stream_counters[device] = stream_counter return s + elif is_device_xpu(device): + ss = [] + for k in range(NUM_STREAMS): + ss.append(torch.xpu.Stream(device=device, priority=0)) + STREAMS[device] = ss + s = ss[stream_counter] + stream_counter = (stream_counter + 1) % len(ss) + stream_counters[device] = stream_counter + return s return None def sync_stream(device, stream): @@ -986,6 +1001,8 @@ def sync_stream(device, stream): return if is_device_cuda(device): torch.cuda.current_stream().wait_stream(stream) + elif is_device_xpu(device): + torch.xpu.current_stream().wait_stream(stream) def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None): if device is None or weight.device == device: @@ -1092,8 +1109,11 @@ def get_free_memory(dev=None, torch_free_too=False): stats = torch.xpu.memory_stats(dev) mem_active = stats['active_bytes.all.current'] mem_reserved = stats['reserved_bytes.all.current'] + if torch_version_numeric < (2, 6): + mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved + else: + mem_free_xpu, _ = torch.xpu.mem_get_info(dev) mem_free_torch = mem_reserved - mem_active - mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved mem_free_total = mem_free_xpu + mem_free_torch elif is_ascend_npu(): stats = torch.npu.memory_stats(dev) @@ -1142,6 +1162,9 @@ def is_device_cpu(device): def is_device_mps(device): return is_device_type(device, 'mps') +def is_device_xpu(device): + return is_device_type(device, 'xpu') + def is_device_cuda(device): return is_device_type(device, 'cuda') @@ -1173,7 +1196,10 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma return False if is_intel_xpu(): - return True + if torch_version_numeric < (2, 3): + return True + else: + return torch.xpu.get_device_properties(device).has_fp16 if is_ascend_npu(): return True @@ -1236,7 +1262,10 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma return False if is_intel_xpu(): - return True + if torch_version_numeric < (2, 6): + return True + else: + return torch.xpu.get_device_capability(device)['has_bfloat16_conversions'] if is_ascend_npu(): return True From 5ad33787dee43d36f8d054c590818b3153b55370 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:20:49 -0700 Subject: [PATCH 12/24] Add default device argument. 
(#9023) --- comfy/cli_args.py | 3 ++- comfy/model_management.py | 1 + main.py | 9 +++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index ef0d4337e..0d760d524 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -49,7 +49,8 @@ parser.add_argument("--temp-directory", type=str, default=None, help="Set the Co parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.") parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.") parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.") -parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.") +parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use. All other devices will not be visible.") +parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.") cm_group = parser.add_mutually_exclusive_group() cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).") cm_group.add_argument("--disable-cuda-malloc", action="store_true", help="Disable cudaMallocAsync.") diff --git a/comfy/model_management.py b/comfy/model_management.py index ab1e9bf3a..346673895 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -880,6 +880,7 @@ def vae_dtype(device=None, allowed_dtypes=[]): return d # NOTE: bfloat16 seems to work on AMD for the VAE but is extremely slow in some cases compared to fp32 + # slowness still a problem on pytorch nightly 2.9.0.dev20250720+rocm6.4 tested on RDNA3 if d == torch.bfloat16 and (not is_amd()) and should_use_bf16(device): return d diff --git a/main.py b/main.py index 2b4ffafd4..e8ca8152a 100644 --- a/main.py +++ b/main.py @@ -115,6 +115,15 @@ if os.name == "nt": logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage()) if __name__ == "__main__": + if args.default_device is not None: + default_dev = args.default_device + devices = list(range(32)) + devices.remove(default_dev) + devices.insert(0, default_dev) + devices = ','.join(map(str, devices)) + os.environ['CUDA_VISIBLE_DEVICES'] = str(devices) + os.environ['HIP_VISIBLE_DEVICES'] = str(devices) + if args.cuda_device is not None: os.environ['CUDA_VISIBLE_DEVICES'] = str(args.cuda_device) os.environ['HIP_VISIBLE_DEVICES'] = str(args.cuda_device) From 39dda1d40d1f2f18ccda8ade860932d0b8a07af4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 15:10:59 -0700 Subject: [PATCH 13/24] Fix xpu function not implemented. 
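Note on PATCH 12/24 above (--default-device): unlike --cuda-device, which hides every other GPU from the process, the new flag only reorders device visibility so the chosen GPU becomes device 0 while the rest stay usable. A small worked example mirroring the main.py snippet (the device count of 32 and id 1 are illustrative):

    # What main.py builds for `--default-device 1`
    default_dev = 1
    devices = list(range(32))
    devices.remove(default_dev)
    devices.insert(0, default_dev)
    print(','.join(map(str, devices)))
    # -> "1,0,2,3,...,31": GPU 1 is exposed as cuda:0, all other GPUs remain visible
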
(#9026) --- comfy/model_management.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 346673895..746b063ed 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -186,10 +186,7 @@ def get_total_memory(dev=None, torch_total_too=False): elif is_intel_xpu(): stats = torch.xpu.memory_stats(dev) mem_reserved = stats['reserved_bytes.all.current'] - if torch_version_numeric < (2, 6): - mem_total_xpu = torch.xpu.get_device_properties(dev).total_memory - else: - _, mem_total_xpu = torch.xpu.mem_get_info(dev) + mem_total_xpu = torch.xpu.get_device_properties(dev).total_memory mem_total_torch = mem_reserved mem_total = mem_total_xpu elif is_ascend_npu(): From a86a58c308c2423e86054462a8c9f1125536a034 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 15:18:20 -0700 Subject: [PATCH 14/24] Fix xpu function not implemented p2. (#9027) --- comfy/model_management.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 746b063ed..42873d09b 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1107,10 +1107,7 @@ def get_free_memory(dev=None, torch_free_too=False): stats = torch.xpu.memory_stats(dev) mem_active = stats['active_bytes.all.current'] mem_reserved = stats['reserved_bytes.all.current'] - if torch_version_numeric < (2, 6): - mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved - else: - mem_free_xpu, _ = torch.xpu.mem_get_info(dev) + mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved mem_free_torch = mem_reserved - mem_active mem_free_total = mem_free_xpu + mem_free_torch elif is_ascend_npu(): From d3504e1778c0cc8992b04fe30dc0fae239c13713 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 16:21:29 -0700 Subject: [PATCH 15/24] Enable pytorch attention by default for gfx1201 on torch 2.8 (#9029) --- comfy/model_management.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 42873d09b..e8b9b5c81 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -308,7 +308,10 @@ try: logging.info("ROCm version: {}".format(rocm_version)) if args.use_split_cross_attention == False and args.use_quad_cross_attention == False: if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much - if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx1201 and gfx950 + if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950 + ENABLE_PYTORCH_ATTENTION = True + if torch_version_numeric >= (2, 8): + if any((a in arch) for a in ["gfx1201"]): ENABLE_PYTORCH_ATTENTION = True if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4): if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]): # TODO: more arches From e78d2304966b6265fa2320b4d87dca534ea15642 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 16:37:43 -0700 Subject: [PATCH 16/24] Only enable cuda malloc on cuda torch. 
(#9031) --- cuda_malloc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_malloc.py b/cuda_malloc.py index eb2857c5f..c1d9ae3ca 100644 --- a/cuda_malloc.py +++ b/cuda_malloc.py @@ -74,7 +74,8 @@ if not args.cuda_malloc: module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) version = module.__version__ - if int(version[0]) >= 2: #enable by default for torch version 2.0 and up + + if int(version[0]) >= 2 and "+cu" in version: #enable by default for torch version 2.0 and up only on cuda torch args.cuda_malloc = cuda_malloc_supported() except: pass From e729a5cc1157bc0ece7daae9583c3a5a3ba95fbb Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Thu, 24 Jul 2025 07:47:05 +0800 Subject: [PATCH 17/24] Separate denoised and noise estimation in Euler CFG++ (#9008) This will change their behavior with the sampling CONST type. It also combines euler_cfg_pp and euler_ancestral_cfg_pp into one main function. --- comfy/k_diffusion/sampling.py | 64 +++++++++++++++++------------------ 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index 2ed415b1f..a2bc492fd 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -1210,39 +1210,21 @@ def sample_deis(model, x, sigmas, extra_args=None, callback=None, disable=None, return x_next -@torch.no_grad() -def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None): - extra_args = {} if extra_args is None else extra_args - - temp = [0] - def post_cfg_function(args): - temp[0] = args["uncond_denoised"] - return args["denoised"] - - model_options = extra_args.get("model_options", {}).copy() - extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True) - - s_in = x.new_ones([x.shape[0]]) - for i in trange(len(sigmas) - 1, disable=disable): - sigma_hat = sigmas[i] - denoised = model(x, sigma_hat * s_in, **extra_args) - d = to_d(x, sigma_hat, temp[0]) - if callback is not None: - callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised}) - # Euler method - x = denoised + d * sigmas[i + 1] - return x - @torch.no_grad() def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): - """Ancestral sampling with Euler method steps.""" + """Ancestral sampling with Euler method steps (CFG++).""" extra_args = {} if extra_args is None else extra_args seed = extra_args.get("seed", None) noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler - temp = [0] + model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling") + lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling) + + uncond_denoised = None + def post_cfg_function(args): - temp[0] = args["uncond_denoised"] + nonlocal uncond_denoised + uncond_denoised = args["uncond_denoised"] return args["denoised"] model_options = extra_args.get("model_options", {}).copy() @@ -1251,15 +1233,33 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No s_in = x.new_ones([x.shape[0]]) for i in trange(len(sigmas) - 1, disable=disable): denoised = model(x, sigmas[i] * s_in, **extra_args) - sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta) if callback is not None: callback({'x': x, 'i': i, 
'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) - d = to_d(x, sigmas[i], temp[0]) - # Euler method - x = denoised + d * sigma_down - if sigmas[i + 1] > 0: - x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up + if sigmas[i + 1] == 0: + # Denoising step + x = denoised + else: + alpha_s = sigmas[i] * lambda_fn(sigmas[i]).exp() + alpha_t = sigmas[i + 1] * lambda_fn(sigmas[i + 1]).exp() + d = to_d(x, sigmas[i], alpha_s * uncond_denoised) # to noise + + # DDIM stochastic sampling + sigma_down, sigma_up = get_ancestral_step(sigmas[i] / alpha_s, sigmas[i + 1] / alpha_t, eta=eta) + sigma_down = alpha_t * sigma_down + + # Euler method + x = alpha_t * denoised + sigma_down * d + if eta > 0 and s_noise > 0: + x = x + alpha_t * noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up return x + + +@torch.no_grad() +def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None): + """Euler method steps (CFG++).""" + return sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=0.0, s_noise=0.0, noise_sampler=None) + + @torch.no_grad() def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): """Ancestral sampling with DPM-Solver++(2S) second-order steps.""" From eb2f78b4e09b1970e2fc51fc5d2e062f1a826399 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Thu, 24 Jul 2025 08:57:27 +0800 Subject: [PATCH 18/24] [Training Node] algo support, grad acc, optional grad ckpt (#9015) * Add factorization utils for lokr * Add lokr train impl * Add loha train impl * Add adapter map for algo selection * Add optional grad ckpt and algo selection * Update __init__.py * correct key name for loha * Use custom fwd/bwd func and better init for loha * Support gradient accumulation * Fix bugs of loha * use more stable init * Add OFT training * linting --- comfy/weight_adapter/__init__.py | 13 ++- comfy/weight_adapter/base.py | 40 +++++++++ comfy/weight_adapter/loha.py | 134 ++++++++++++++++++++++++++++++- comfy/weight_adapter/lokr.py | 86 +++++++++++++++++++- comfy/weight_adapter/oft.py | 67 +++++++++++++++- comfy_extras/nodes_train.py | 47 ++++++++--- 6 files changed, 372 insertions(+), 15 deletions(-) diff --git a/comfy/weight_adapter/__init__.py b/comfy/weight_adapter/__init__.py index 560b82be3..b40f920e4 100644 --- a/comfy/weight_adapter/__init__.py +++ b/comfy/weight_adapter/__init__.py @@ -15,9 +15,20 @@ adapters: list[type[WeightAdapterBase]] = [ OFTAdapter, BOFTAdapter, ] +adapter_maps: dict[str, type[WeightAdapterBase]] = { + "LoRA": LoRAAdapter, + "LoHa": LoHaAdapter, + "LoKr": LoKrAdapter, + "OFT": OFTAdapter, + ## We disable not implemented algo for now + # "GLoRA": GLoRAAdapter, + # "BOFT": BOFTAdapter, +} + __all__ = [ "WeightAdapterBase", "WeightAdapterTrainBase", - "adapters" + "adapters", + "adapter_maps", ] + [a.__name__ for a in adapters] diff --git a/comfy/weight_adapter/base.py b/comfy/weight_adapter/base.py index b5c7db423..43644b106 100644 --- a/comfy/weight_adapter/base.py +++ b/comfy/weight_adapter/base.py @@ -133,3 +133,43 @@ def tucker_weight_from_conv(up, down, mid): def tucker_weight(wa, wb, t): temp = torch.einsum("i j ..., j r -> i r ...", t, wb) return torch.einsum("i j ..., i r -> r j ...", temp, wa) + + +def factorization(dimension: int, factor: int = -1) -> tuple[int, int]: + """ + return a tuple of two value of input dimension 
decomposed by the number closest to factor + second value is higher or equal than first value. + + examples) + factor + -1 2 4 8 16 ... + 127 -> 1, 127 127 -> 1, 127 127 -> 1, 127 127 -> 1, 127 127 -> 1, 127 + 128 -> 8, 16 128 -> 2, 64 128 -> 4, 32 128 -> 8, 16 128 -> 8, 16 + 250 -> 10, 25 250 -> 2, 125 250 -> 2, 125 250 -> 5, 50 250 -> 10, 25 + 360 -> 8, 45 360 -> 2, 180 360 -> 4, 90 360 -> 8, 45 360 -> 12, 30 + 512 -> 16, 32 512 -> 2, 256 512 -> 4, 128 512 -> 8, 64 512 -> 16, 32 + 1024 -> 32, 32 1024 -> 2, 512 1024 -> 4, 256 1024 -> 8, 128 1024 -> 16, 64 + """ + + if factor > 0 and (dimension % factor) == 0 and dimension >= factor**2: + m = factor + n = dimension // factor + if m > n: + n, m = m, n + return m, n + if factor < 0: + factor = dimension + m, n = 1, dimension + length = m + n + while m < n: + new_m = m + 1 + while dimension % new_m != 0: + new_m += 1 + new_n = dimension // new_m + if new_m + new_n > length or new_m > factor: + break + else: + m, n = new_m, new_n + if m > n: + n, m = m, n + return m, n diff --git a/comfy/weight_adapter/loha.py b/comfy/weight_adapter/loha.py index ce79abad5..55c97a3af 100644 --- a/comfy/weight_adapter/loha.py +++ b/comfy/weight_adapter/loha.py @@ -3,7 +3,120 @@ from typing import Optional import torch import comfy.model_management -from .base import WeightAdapterBase, weight_decompose +from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose + + +class HadaWeight(torch.autograd.Function): + @staticmethod + def forward(ctx, w1u, w1d, w2u, w2d, scale=torch.tensor(1)): + ctx.save_for_backward(w1d, w1u, w2d, w2u, scale) + diff_weight = ((w1u @ w1d) * (w2u @ w2d)) * scale + return diff_weight + + @staticmethod + def backward(ctx, grad_out): + (w1d, w1u, w2d, w2u, scale) = ctx.saved_tensors + grad_out = grad_out * scale + temp = grad_out * (w2u @ w2d) + grad_w1u = temp @ w1d.T + grad_w1d = w1u.T @ temp + + temp = grad_out * (w1u @ w1d) + grad_w2u = temp @ w2d.T + grad_w2d = w2u.T @ temp + + del temp + return grad_w1u, grad_w1d, grad_w2u, grad_w2d, None + + +class HadaWeightTucker(torch.autograd.Function): + @staticmethod + def forward(ctx, t1, w1u, w1d, t2, w2u, w2d, scale=torch.tensor(1)): + ctx.save_for_backward(t1, w1d, w1u, t2, w2d, w2u, scale) + + rebuild1 = torch.einsum("i j ..., j r, i p -> p r ...", t1, w1d, w1u) + rebuild2 = torch.einsum("i j ..., j r, i p -> p r ...", t2, w2d, w2u) + + return rebuild1 * rebuild2 * scale + + @staticmethod + def backward(ctx, grad_out): + (t1, w1d, w1u, t2, w2d, w2u, scale) = ctx.saved_tensors + grad_out = grad_out * scale + + temp = torch.einsum("i j ..., j r -> i r ...", t2, w2d) + rebuild = torch.einsum("i j ..., i r -> r j ...", temp, w2u) + + grad_w = rebuild * grad_out + del rebuild + + grad_w1u = torch.einsum("r j ..., i j ... -> r i", temp, grad_w) + grad_temp = torch.einsum("i j ..., i r -> r j ...", grad_w, w1u.T) + del grad_w, temp + + grad_w1d = torch.einsum("i r ..., i j ... -> r j", t1, grad_temp) + grad_t1 = torch.einsum("i j ..., j r -> i r ...", grad_temp, w1d.T) + del grad_temp + + temp = torch.einsum("i j ..., j r -> i r ...", t1, w1d) + rebuild = torch.einsum("i j ..., i r -> r j ...", temp, w1u) + + grad_w = rebuild * grad_out + del rebuild + + grad_w2u = torch.einsum("r j ..., i j ... -> r i", temp, grad_w) + grad_temp = torch.einsum("i j ..., i r -> r j ...", grad_w, w2u.T) + del grad_w, temp + + grad_w2d = torch.einsum("i r ..., i j ... 
-> r j", t2, grad_temp) + grad_t2 = torch.einsum("i j ..., j r -> i r ...", grad_temp, w2d.T) + del grad_temp + return grad_t1, grad_w1u, grad_w1d, grad_t2, grad_w2u, grad_w2d, None + + +class LohaDiff(WeightAdapterTrainBase): + def __init__(self, weights): + super().__init__() + # Unpack weights tuple from LoHaAdapter + w1a, w1b, alpha, w2a, w2b, t1, t2, _ = weights + + # Create trainable parameters + self.hada_w1_a = torch.nn.Parameter(w1a) + self.hada_w1_b = torch.nn.Parameter(w1b) + self.hada_w2_a = torch.nn.Parameter(w2a) + self.hada_w2_b = torch.nn.Parameter(w2b) + + self.use_tucker = False + if t1 is not None and t2 is not None: + self.use_tucker = True + self.hada_t1 = torch.nn.Parameter(t1) + self.hada_t2 = torch.nn.Parameter(t2) + else: + # Keep the attributes for consistent access + self.hada_t1 = None + self.hada_t2 = None + + # Store rank and non-trainable alpha + self.rank = w1b.shape[0] + self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False) + + def __call__(self, w): + org_dtype = w.dtype + + scale = self.alpha / self.rank + if self.use_tucker: + diff_weight = HadaWeightTucker.apply(self.hada_t1, self.hada_w1_a, self.hada_w1_b, self.hada_t2, self.hada_w2_a, self.hada_w2_b, scale) + else: + diff_weight = HadaWeight.apply(self.hada_w1_a, self.hada_w1_b, self.hada_w2_a, self.hada_w2_b, scale) + + # Add the scaled difference to the original weight + weight = w.to(diff_weight) + diff_weight.reshape(w.shape) + + return weight.to(org_dtype) + + def passive_memory_usage(self): + """Calculates memory usage of the trainable parameters.""" + return sum(param.numel() * param.element_size() for param in self.parameters()) class LoHaAdapter(WeightAdapterBase): @@ -13,6 +126,25 @@ class LoHaAdapter(WeightAdapterBase): self.loaded_keys = loaded_keys self.weights = weights + @classmethod + def create_train(cls, weight, rank=1, alpha=1.0): + out_dim = weight.shape[0] + in_dim = weight.shape[1:].numel() + mat1 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype) + mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype) + torch.nn.init.normal_(mat1, 0.1) + torch.nn.init.constant_(mat2, 0.0) + mat3 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype) + mat4 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype) + torch.nn.init.normal_(mat3, 0.1) + torch.nn.init.normal_(mat4, 0.01) + return LohaDiff( + (mat1, mat2, alpha, mat3, mat4, None, None, None) + ) + + def to_train(self): + return LohaDiff(self.weights) + @classmethod def load( cls, diff --git a/comfy/weight_adapter/lokr.py b/comfy/weight_adapter/lokr.py index 51233db2d..49b0be55f 100644 --- a/comfy/weight_adapter/lokr.py +++ b/comfy/weight_adapter/lokr.py @@ -3,7 +3,77 @@ from typing import Optional import torch import comfy.model_management -from .base import WeightAdapterBase, weight_decompose +from .base import ( + WeightAdapterBase, + WeightAdapterTrainBase, + weight_decompose, + factorization, +) + + +class LokrDiff(WeightAdapterTrainBase): + def __init__(self, weights): + super().__init__() + (lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t2, dora_scale) = weights + self.use_tucker = False + if lokr_w1_a is not None: + _, rank_a = lokr_w1_a.shape[0], lokr_w1_a.shape[1] + rank_a, _ = lokr_w1_b.shape[0], lokr_w1_b.shape[1] + self.lokr_w1_a = torch.nn.Parameter(lokr_w1_a) + self.lokr_w1_b = torch.nn.Parameter(lokr_w1_b) + self.w1_rebuild = True + self.ranka = rank_a + + if lokr_w2_a is not None: + _, rank_b = 
lokr_w2_a.shape[0], lokr_w2_a.shape[1] + rank_b, _ = lokr_w2_b.shape[0], lokr_w2_b.shape[1] + self.lokr_w2_a = torch.nn.Parameter(lokr_w2_a) + self.lokr_w2_b = torch.nn.Parameter(lokr_w2_b) + if lokr_t2 is not None: + self.use_tucker = True + self.lokr_t2 = torch.nn.Parameter(lokr_t2) + self.w2_rebuild = True + self.rankb = rank_b + + if lokr_w1 is not None: + self.lokr_w1 = torch.nn.Parameter(lokr_w1) + self.w1_rebuild = False + + if lokr_w2 is not None: + self.lokr_w2 = torch.nn.Parameter(lokr_w2) + self.w2_rebuild = False + + self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False) + + @property + def w1(self): + if self.w1_rebuild: + return (self.lokr_w1_a @ self.lokr_w1_b) * (self.alpha / self.ranka) + else: + return self.lokr_w1 + + @property + def w2(self): + if self.w2_rebuild: + if self.use_tucker: + w2 = torch.einsum( + 'i j k l, j r, i p -> p r k l', + self.lokr_t2, + self.lokr_w2_b, + self.lokr_w2_a + ) + else: + w2 = self.lokr_w2_a @ self.lokr_w2_b + return w2 * (self.alpha / self.rankb) + else: + return self.lokr_w2 + + def __call__(self, w): + diff = torch.kron(self.w1, self.w2) + return w + diff.reshape(w.shape).to(w) + + def passive_memory_usage(self): + return sum(param.numel() * param.element_size() for param in self.parameters()) class LoKrAdapter(WeightAdapterBase): @@ -13,6 +83,20 @@ class LoKrAdapter(WeightAdapterBase): self.loaded_keys = loaded_keys self.weights = weights + @classmethod + def create_train(cls, weight, rank=1, alpha=1.0): + out_dim = weight.shape[0] + in_dim = weight.shape[1:].numel() + out1, out2 = factorization(out_dim, rank) + in1, in2 = factorization(in_dim, rank) + mat1 = torch.empty(out1, in1, device=weight.device, dtype=weight.dtype) + mat2 = torch.empty(out2, in2, device=weight.device, dtype=weight.dtype) + torch.nn.init.kaiming_uniform_(mat2, a=5**0.5) + torch.nn.init.constant_(mat1, 0.0) + return LokrDiff( + (mat1, mat2, alpha, None, None, None, None, None, None) + ) + @classmethod def load( cls, diff --git a/comfy/weight_adapter/oft.py b/comfy/weight_adapter/oft.py index 25009eca3..9d4982083 100644 --- a/comfy/weight_adapter/oft.py +++ b/comfy/weight_adapter/oft.py @@ -3,7 +3,58 @@ from typing import Optional import torch import comfy.model_management -from .base import WeightAdapterBase, weight_decompose +from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose, factorization + + +class OFTDiff(WeightAdapterTrainBase): + def __init__(self, weights): + super().__init__() + # Unpack weights tuple from LoHaAdapter + blocks, rescale, alpha, _ = weights + + # Create trainable parameters + self.oft_blocks = torch.nn.Parameter(blocks) + if rescale is not None: + self.rescale = torch.nn.Parameter(rescale) + self.rescaled = True + else: + self.rescaled = False + self.block_num, self.block_size, _ = blocks.shape + self.constraint = float(alpha) + self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False) + + def __call__(self, w): + org_dtype = w.dtype + I = torch.eye(self.block_size, device=self.oft_blocks.device) + + ## generate r + # for Q = -Q^T + q = self.oft_blocks - self.oft_blocks.transpose(1, 2) + normed_q = q + if self.constraint: + q_norm = torch.norm(q) + 1e-8 + if q_norm > self.constraint: + normed_q = q * self.constraint / q_norm + # use float() to prevent unsupported type + r = (I + normed_q) @ (I - normed_q).float().inverse() + + ## Apply chunked matmul on weight + _, *shape = w.shape + org_weight = w.to(dtype=r.dtype) + org_weight = org_weight.unflatten(0, (self.block_num, 
self.block_size)) + # Init R=0, so add I on it to ensure the output of step0 is original model output + weight = torch.einsum( + "k n m, k n ... -> k m ...", + r, + org_weight, + ).flatten(0, 1) + if self.rescaled: + weight = self.rescale * weight + return weight.to(org_dtype) + + def passive_memory_usage(self): + """Calculates memory usage of the trainable parameters.""" + return sum(param.numel() * param.element_size() for param in self.parameters()) class OFTAdapter(WeightAdapterBase): @@ -13,6 +64,18 @@ class OFTAdapter(WeightAdapterBase): self.loaded_keys = loaded_keys self.weights = weights + @classmethod + def create_train(cls, weight, rank=1, alpha=1.0): + out_dim = weight.shape[0] + block_size, block_num = factorization(out_dim, rank) + block = torch.zeros(block_num, block_size, block_size, device=weight.device, dtype=weight.dtype) + return OFTDiff( + (block, None, alpha, None) + ) + + def to_train(self): + return OFTDiff(self.weights) + @classmethod def load( cls, @@ -60,6 +123,8 @@ class OFTAdapter(WeightAdapterBase): blocks = v[0] rescale = v[1] alpha = v[2] + if alpha is None: + alpha = 0 dora_scale = v[3] blocks = comfy.model_management.cast_to_device(blocks, weight.device, intermediate_dtype) diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index 3d05fdab5..c3aaaee9b 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -20,7 +20,7 @@ import folder_paths import node_helpers from comfy.cli_args import args from comfy.comfy_types.node_typing import IO -from comfy.weight_adapter import adapters +from comfy.weight_adapter import adapters, adapter_maps def make_batch_extra_option_dict(d, indicies, full_size=None): @@ -39,13 +39,13 @@ def make_batch_extra_option_dict(d, indicies, full_size=None): class TrainSampler(comfy.samplers.Sampler): - - def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, total_steps=1, seed=0, training_dtype=torch.bfloat16): + def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, grad_acc=1, total_steps=1, seed=0, training_dtype=torch.bfloat16): self.loss_fn = loss_fn self.optimizer = optimizer self.loss_callback = loss_callback self.batch_size = batch_size self.total_steps = total_steps + self.grad_acc = grad_acc self.seed = seed self.training_dtype = training_dtype @@ -92,8 +92,9 @@ class TrainSampler(comfy.samplers.Sampler): self.loss_callback(loss.item()) pbar.set_postfix({"loss": f"{loss.item():.4f}"}) - self.optimizer.step() - self.optimizer.zero_grad() + if (i+1) % self.grad_acc == 0: + self.optimizer.step() + self.optimizer.zero_grad() torch.cuda.empty_cache() return torch.zeros_like(latent_image) @@ -419,6 +420,16 @@ class TrainLoraNode: "tooltip": "The batch size to use for training.", }, ), + "grad_accumulation_steps": ( + IO.INT, + { + "default": 1, + "min": 1, + "max": 1024, + "step": 1, + "tooltip": "The number of gradient accumulation steps to use for training.", + } + ), "steps": ( IO.INT, { @@ -478,6 +489,17 @@ class TrainLoraNode: ["bf16", "fp32"], {"default": "bf16", "tooltip": "The dtype to use for lora."}, ), + "algorithm": ( + list(adapter_maps.keys()), + {"default": list(adapter_maps.keys())[0], "tooltip": "The algorithm to use for training."}, + ), + "gradient_checkpointing": ( + IO.BOOLEAN, + { + "default": True, + "tooltip": "Use gradient checkpointing for training.", + } + ), "existing_lora": ( folder_paths.get_filename_list("loras") + ["[None]"], { @@ -501,6 +523,7 @@ class TrainLoraNode: positive, batch_size, steps, + grad_accumulation_steps, 
learning_rate, rank, optimizer, @@ -508,6 +531,8 @@ class TrainLoraNode: seed, training_dtype, lora_dtype, + algorithm, + gradient_checkpointing, existing_lora, ): mp = model.clone() @@ -558,10 +583,8 @@ class TrainLoraNode: if existing_adapter is not None: break else: - # If no existing adapter found, use LoRA - # We will add algo option in the future existing_adapter = None - adapter_cls = adapters[0] + adapter_cls = adapter_maps[algorithm] if existing_adapter is not None: train_adapter = existing_adapter.to_train().to(lora_dtype) @@ -615,8 +638,9 @@ class TrainLoraNode: criterion = torch.nn.SmoothL1Loss() # setup models - for m in find_all_highest_child_module_with_forward(mp.model.diffusion_model): - patch(m) + if gradient_checkpointing: + for m in find_all_highest_child_module_with_forward(mp.model.diffusion_model): + patch(m) mp.model.requires_grad_(False) comfy.model_management.load_models_gpu([mp], memory_required=1e20, force_full_load=True) @@ -629,7 +653,8 @@ class TrainLoraNode: optimizer, loss_callback=loss_callback, batch_size=batch_size, - total_steps=steps, + grad_acc=grad_accumulation_steps, + total_steps=steps*grad_accumulation_steps, seed=seed, training_dtype=dtype ) From 0ccc88b03fbe190135e24ac04612565f8f0756b4 Mon Sep 17 00:00:00 2001 From: honglyua Date: Fri, 25 Jul 2025 01:57:36 +0800 Subject: [PATCH 19/24] Support Iluvatar CoreX (#8585) * Support Iluvatar CoreX Co-authored-by: mingjiang.li --- README.md | 7 +++++++ comfy/model_management.py | 23 ++++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d004364ee..a148623cd 100644 --- a/README.md +++ b/README.md @@ -294,6 +294,13 @@ For models compatible with Cambricon Extension for PyTorch (torch_mlu). Here's a 2. Next, install the PyTorch(torch_mlu) following the instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cambricon_pytorch_1.17.0/user_guide_1.9/index.html) 3. Launch ComfyUI by running `python main.py` +#### Iluvatar Corex + +For models compatible with Iluvatar Extension for PyTorch. Here's a step-by-step guide tailored to your platform and installation method: + +1. Install the Iluvatar Corex Toolkit by adhering to the platform-specific instructions on the [Installation](https://support.iluvatar.com/#/DocumentCentre?id=1&nameCenter=2&productId=520117912052801536) +2. 
Launch ComfyUI by running `python main.py` + # Running ```python main.py``` diff --git a/comfy/model_management.py b/comfy/model_management.py index e8b9b5c81..9add54ceb 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -128,6 +128,11 @@ try: except: mlu_available = False +try: + ixuca_available = hasattr(torch, "corex") +except: + ixuca_available = False + if args.cpu: cpu_state = CPUState.CPU @@ -151,6 +156,12 @@ def is_mlu(): return True return False +def is_ixuca(): + global ixuca_available + if ixuca_available: + return True + return False + def get_torch_device(): global directml_enabled global cpu_state @@ -289,7 +300,7 @@ try: if torch_version_numeric[0] >= 2: if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False: ENABLE_PYTORCH_ATTENTION = True - if is_intel_xpu() or is_ascend_npu() or is_mlu(): + if is_intel_xpu() or is_ascend_npu() or is_mlu() or is_ixuca(): if args.use_split_cross_attention == False and args.use_quad_cross_attention == False: ENABLE_PYTORCH_ATTENTION = True except: @@ -1045,6 +1056,8 @@ def xformers_enabled(): return False if is_mlu(): return False + if is_ixuca(): + return False if directml_enabled: return False return XFORMERS_IS_AVAILABLE @@ -1080,6 +1093,8 @@ def pytorch_attention_flash_attention(): return True if is_amd(): return True #if you have pytorch attention enabled on AMD it probably supports at least mem efficient attention + if is_ixuca(): + return True return False def force_upcast_attention_dtype(): @@ -1205,6 +1220,9 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma if is_mlu(): return True + if is_ixuca(): + return True + if torch.version.hip: return True @@ -1268,6 +1286,9 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma if is_ascend_npu(): return True + if is_ixuca(): + return True + if is_amd(): arch = torch.cuda.get_device_properties(device).gcnArchName if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]): # RDNA2 and older don't support bf16 From d03ae077b4330f58e7caba53ff94e7fd58d0dc7d Mon Sep 17 00:00:00 2001 From: SHIVANSH GUPTA <121501003+shivansh-gupta4@users.noreply.github.com> Date: Thu, 24 Jul 2025 23:35:54 +0530 Subject: [PATCH 20/24] Added parameter required_frontend_version in the /system_stats API response (#8875) * Added the parameter required_frontend_version in the /system_stats api response * Update server.py * Created a function get_required_frontend_version and wrote tests for it * Refactored the function to return currently installed frontend pacakage version * Moved required_frontend to a new function and imported that in server.py * Corrected test cases using mocking techniques * Corrected files to comply with ruff formatting --- app/frontend_management.py | 47 +++++++++++++++++--- server.py | 2 + tests-unit/app_test/frontend_manager_test.py | 35 ++++++++++++++- 3 files changed, 77 insertions(+), 7 deletions(-) diff --git a/app/frontend_management.py b/app/frontend_management.py index 001ebbecb..0bee73685 100644 --- a/app/frontend_management.py +++ b/app/frontend_management.py @@ -29,18 +29,48 @@ def frontend_install_warning_message(): This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead. 
""".strip() +def parse_version(version: str) -> tuple[int, int, int]: + return tuple(map(int, version.split("."))) + +def is_valid_version(version: str) -> bool: + """Validate if a string is a valid semantic version (X.Y.Z format).""" + pattern = r"^(\d+)\.(\d+)\.(\d+)$" + return bool(re.match(pattern, version)) + +def get_installed_frontend_version(): + """Get the currently installed frontend package version.""" + frontend_version_str = version("comfyui-frontend-package") + return frontend_version_str + +def get_required_frontend_version(): + """Get the required frontend version from requirements.txt.""" + try: + with open(requirements_path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line.startswith("comfyui-frontend-package=="): + version_str = line.split("==")[-1] + if not is_valid_version(version_str): + logging.error(f"Invalid version format in requirements.txt: {version_str}") + return None + return version_str + logging.error("comfyui-frontend-package not found in requirements.txt") + return None + except FileNotFoundError: + logging.error("requirements.txt not found. Cannot determine required frontend version.") + return None + except Exception as e: + logging.error(f"Error reading requirements.txt: {e}") + return None def check_frontend_version(): """Check if the frontend version is up to date.""" - def parse_version(version: str) -> tuple[int, int, int]: - return tuple(map(int, version.split("."))) - try: - frontend_version_str = version("comfyui-frontend-package") + frontend_version_str = get_installed_frontend_version() frontend_version = parse_version(frontend_version_str) - with open(requirements_path, "r", encoding="utf-8") as f: - required_frontend = parse_version(f.readline().split("=")[-1]) + required_frontend_str = get_required_frontend_version() + required_frontend = parse_version(required_frontend_str) if frontend_version < required_frontend: app.logger.log_startup_warning( f""" @@ -168,6 +198,11 @@ def download_release_asset_zip(release: Release, destination_path: str) -> None: class FrontendManager: CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions") + @classmethod + def get_required_frontend_version(cls) -> str: + """Get the required frontend package version.""" + return get_required_frontend_version() + @classmethod def default_frontend_path(cls) -> str: try: diff --git a/server.py b/server.py index 71a58f0fa..f4de0079b 100644 --- a/server.py +++ b/server.py @@ -553,6 +553,7 @@ class PromptServer(): ram_free = comfy.model_management.get_free_memory(cpu_device) vram_total, torch_vram_total = comfy.model_management.get_total_memory(device, torch_total_too=True) vram_free, torch_vram_free = comfy.model_management.get_free_memory(device, torch_free_too=True) + required_frontend_version = FrontendManager.get_required_frontend_version() system_stats = { "system": { @@ -560,6 +561,7 @@ class PromptServer(): "ram_total": ram_total, "ram_free": ram_free, "comfyui_version": __version__, + "required_frontend_version": required_frontend_version, "python_version": sys.version, "pytorch_version": comfy.model_management.torch_version, "embedded_python": os.path.split(os.path.split(sys.executable)[0])[1] == "python_embeded", diff --git a/tests-unit/app_test/frontend_manager_test.py b/tests-unit/app_test/frontend_manager_test.py index ce67df6c6..ce43ac564 100644 --- a/tests-unit/app_test/frontend_manager_test.py +++ b/tests-unit/app_test/frontend_manager_test.py @@ -1,7 +1,7 @@ import argparse import pytest from 
requests.exceptions import HTTPError -from unittest.mock import patch +from unittest.mock import patch, mock_open from app.frontend_management import ( FrontendManager, @@ -172,3 +172,36 @@ def test_init_frontend_fallback_on_error(): # Assert assert frontend_path == "/default/path" mock_check.assert_called_once() + + +def test_get_frontend_version(): + # Arrange + expected_version = "1.25.0" + mock_requirements_content = """torch +torchsde +comfyui-frontend-package==1.25.0 +other-package==1.0.0 +numpy""" + + # Act + with patch("builtins.open", mock_open(read_data=mock_requirements_content)): + version = FrontendManager.get_required_frontend_version() + + # Assert + assert version == expected_version + + +def test_get_frontend_version_invalid_semver(): + # Arrange + mock_requirements_content = """torch +torchsde +comfyui-frontend-package==1.29.3.75 +other-package==1.0.0 +numpy""" + + # Act + with patch("builtins.open", mock_open(read_data=mock_requirements_content)): + version = FrontendManager.get_required_frontend_version() + + # Assert + assert version is None From 69cb57b3426b08a82e7fb713b0b48c23725f3da7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 24 Jul 2025 12:06:25 -0700 Subject: [PATCH 21/24] Print xpu device name. (#9035) --- comfy/model_management.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/model_management.py b/comfy/model_management.py index 9add54ceb..232d363aa 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -392,6 +392,8 @@ def get_torch_device_name(device): except: allocator_backend = "" return "{} {} : {}".format(device, torch.cuda.get_device_name(device), allocator_backend) + elif device.type == "xpu": + return "{} {}".format(device, torch.xpu.get_device_name(device)) else: return "{}".format(device.type) elif is_intel_xpu(): From 4293e4da214f77a3fde97c15f0691307e61bc18d Mon Sep 17 00:00:00 2001 From: Eugene Fairley Date: Thu, 24 Jul 2025 17:59:19 -0700 Subject: [PATCH 22/24] Add WAN ATI support (#8874) * Add WAN ATI support * Fixes * Fix length * Remove extra functions * Fix * Fix * Ruff fix * Remove torch.no_grad * Add batch trajectory logic * Scale inputs before and after motion patch * Batch image/trajectory * Ruff fix * Clean up --- comfy/utils.py | 20 +++ comfy_extras/nodes_wan.py | 305 +++++++++++++++++++++++++++++++++++++- 2 files changed, 324 insertions(+), 1 deletion(-) diff --git a/comfy/utils.py b/comfy/utils.py index 9c076a0e0..fab28cf08 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -698,6 +698,26 @@ def resize_to_batch_size(tensor, batch_size): return output +def resize_list_to_batch_size(l, batch_size): + in_batch_size = len(l) + if in_batch_size == batch_size or in_batch_size == 0: + return l + + if batch_size <= 1: + return l[:batch_size] + + output = [] + if batch_size < in_batch_size: + scale = (in_batch_size - 1) / (batch_size - 1) + for i in range(batch_size): + output.append(l[min(round(i * scale), in_batch_size - 1)]) + else: + scale = in_batch_size / batch_size + for i in range(batch_size): + output.append(l[min(math.floor((i + 0.5) * scale), in_batch_size - 1)]) + + return output + def convert_sd_to(state_dict, dtype): keys = list(state_dict.keys()) for k in keys: diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index d6097a104..d71908f31 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1,3 +1,4 @@ +import math import nodes import node_helpers import torch @@ -5,7 +6,9 @@ import 
comfy.model_management import comfy.utils import comfy.latent_formats import comfy.clip_vision - +import json +import numpy as np +from typing import Tuple class WanImageToVideo: @classmethod @@ -383,7 +386,307 @@ class WanPhantomSubjectToVideo: out_latent["samples"] = latent return (positive, cond2, negative, out_latent) +def parse_json_tracks(tracks): + """Parse JSON track data into a standardized format""" + tracks_data = [] + try: + # If tracks is a string, try to parse it as JSON + if isinstance(tracks, str): + parsed = json.loads(tracks.replace("'", '"')) + tracks_data.extend(parsed) + else: + # If tracks is a list of strings, parse each one + for track_str in tracks: + parsed = json.loads(track_str.replace("'", '"')) + tracks_data.append(parsed) + + # Check if we have a single track (dict with x,y) or a list of tracks + if tracks_data and isinstance(tracks_data[0], dict) and 'x' in tracks_data[0]: + # Single track detected, wrap it in a list + tracks_data = [tracks_data] + elif tracks_data and isinstance(tracks_data[0], list) and tracks_data[0] and isinstance(tracks_data[0][0], dict) and 'x' in tracks_data[0][0]: + # Already a list of tracks, nothing to do + pass + else: + # Unexpected format + pass + + except json.JSONDecodeError: + tracks_data = [] + return tracks_data + +def process_tracks(tracks_np: np.ndarray, frame_size: Tuple[int, int], num_frames, quant_multi: int = 8, **kwargs): + # tracks: shape [t, h, w, 3] => samples align with 24 fps, model trained with 16 fps. + # frame_size: tuple (W, H) + tracks = torch.from_numpy(tracks_np).float() + + if tracks.shape[1] == 121: + tracks = torch.permute(tracks, (1, 0, 2, 3)) + + tracks, visibles = tracks[..., :2], tracks[..., 2:3] + + short_edge = min(*frame_size) + + frame_center = torch.tensor([*frame_size]).type_as(tracks) / 2 + tracks = tracks - frame_center + + tracks = tracks / short_edge * 2 + + visibles = visibles * 2 - 1 + + trange = torch.linspace(-1, 1, tracks.shape[0]).view(-1, 1, 1, 1).expand(*visibles.shape) + + out_ = torch.cat([trange, tracks, visibles], dim=-1).view(121, -1, 4) + + out_0 = out_[:1] + + out_l = out_[1:] # 121 => 120 | 1 + a = 120 // math.gcd(120, num_frames) + b = num_frames // math.gcd(120, num_frames) + out_l = torch.repeat_interleave(out_l, b, dim=0)[1::a] # 120 => 120 * b => 120 * b / a == F + + final_result = torch.cat([out_0, out_l], dim=0) + + return final_result + +FIXED_LENGTH = 121 +def pad_pts(tr): + """Convert list of {x,y} to (FIXED_LENGTH,1,3) array, padding/truncating.""" + pts = np.array([[p['x'], p['y'], 1] for p in tr], dtype=np.float32) + n = pts.shape[0] + if n < FIXED_LENGTH: + pad = np.zeros((FIXED_LENGTH - n, 3), dtype=np.float32) + pts = np.vstack((pts, pad)) + else: + pts = pts[:FIXED_LENGTH] + return pts.reshape(FIXED_LENGTH, 1, 3) + +def ind_sel(target: torch.Tensor, ind: torch.Tensor, dim: int = 1): + """Index selection utility function""" + assert ( + len(ind.shape) > dim + ), "Index must have the target dim, but get dim: %d, ind shape: %s" % (dim, str(ind.shape)) + + target = target.expand( + *tuple( + [ind.shape[k] if target.shape[k] == 1 else -1 for k in range(dim)] + + [ + -1, + ] + * (len(target.shape) - dim) + ) + ) + + ind_pad = ind + + if len(target.shape) > dim + 1: + for _ in range(len(target.shape) - (dim + 1)): + ind_pad = ind_pad.unsqueeze(-1) + ind_pad = ind_pad.expand(*(-1,) * (dim + 1), *target.shape[(dim + 1) : :]) + + return torch.gather(target, dim=dim, index=ind_pad) + +def merge_final(vert_attr: torch.Tensor, weight: torch.Tensor, vert_assign: 
torch.Tensor): + """Merge vertex attributes with weights""" + target_dim = len(vert_assign.shape) - 1 + if len(vert_attr.shape) == 2: + assert vert_attr.shape[0] > vert_assign.max() + new_shape = [1] * target_dim + list(vert_attr.shape) + tensor = vert_attr.reshape(new_shape) + sel_attr = ind_sel(tensor, vert_assign.type(torch.long), dim=target_dim) + else: + assert vert_attr.shape[1] > vert_assign.max() + new_shape = [vert_attr.shape[0]] + [1] * (target_dim - 1) + list(vert_attr.shape[1:]) + tensor = vert_attr.reshape(new_shape) + sel_attr = ind_sel(tensor, vert_assign.type(torch.long), dim=target_dim) + + final_attr = torch.sum(sel_attr * weight.unsqueeze(-1), dim=-2) + return final_attr + + +def _patch_motion_single( + tracks: torch.FloatTensor, # (B, T, N, 4) + vid: torch.FloatTensor, # (C, T, H, W) + temperature: float, + vae_divide: tuple, + topk: int, +): + """Apply motion patching based on tracks""" + _, T, H, W = vid.shape + N = tracks.shape[2] + _, tracks_xy, visible = torch.split( + tracks, [1, 2, 1], dim=-1 + ) # (B, T, N, 2) | (B, T, N, 1) + tracks_n = tracks_xy / torch.tensor([W / min(H, W), H / min(H, W)], device=tracks_xy.device) + tracks_n = tracks_n.clamp(-1, 1) + visible = visible.clamp(0, 1) + + xx = torch.linspace(-W / min(H, W), W / min(H, W), W) + yy = torch.linspace(-H / min(H, W), H / min(H, W), H) + + grid = torch.stack(torch.meshgrid(yy, xx, indexing="ij")[::-1], dim=-1).to( + tracks_xy.device + ) + + tracks_pad = tracks_xy[:, 1:] + visible_pad = visible[:, 1:] + + visible_align = visible_pad.view(T - 1, 4, *visible_pad.shape[2:]).sum(1) + tracks_align = (tracks_pad * visible_pad).view(T - 1, 4, *tracks_pad.shape[2:]).sum( + 1 + ) / (visible_align + 1e-5) + dist_ = ( + (tracks_align[:, None, None] - grid[None, :, :, None]).pow(2).sum(-1) + ) # T, H, W, N + weight = torch.exp(-dist_ * temperature) * visible_align.clamp(0, 1).view( + T - 1, 1, 1, N + ) + vert_weight, vert_index = torch.topk( + weight, k=min(topk, weight.shape[-1]), dim=-1 + ) + + grid_mode = "bilinear" + point_feature = torch.nn.functional.grid_sample( + vid.permute(1, 0, 2, 3)[:1], + tracks_n[:, :1].type(vid.dtype), + mode=grid_mode, + padding_mode="zeros", + align_corners=False, + ) + point_feature = point_feature.squeeze(0).squeeze(1).permute(1, 0) # N, C=16 + + out_feature = merge_final(point_feature, vert_weight, vert_index).permute(3, 0, 1, 2) # T - 1, H, W, C => C, T - 1, H, W + out_weight = vert_weight.sum(-1) # T - 1, H, W + + # out feature -> already soft weighted + mix_feature = out_feature + vid[:, 1:] * (1 - out_weight.clamp(0, 1)) + + out_feature_full = torch.cat([vid[:, :1], mix_feature], dim=1) # C, T, H, W + out_mask_full = torch.cat([torch.ones_like(out_weight[:1]), out_weight], dim=0) # T, H, W + + return out_mask_full[None].expand(vae_divide[0], -1, -1, -1), out_feature_full + + +def patch_motion( + tracks: torch.FloatTensor, # (B, TB, T, N, 4) + vid: torch.FloatTensor, # (C, T, H, W) + temperature: float = 220.0, + vae_divide: tuple = (4, 16), + topk: int = 2, +): + B = len(tracks) + + # Process each batch separately + out_masks = [] + out_features = [] + + for b in range(B): + mask, feature = _patch_motion_single( + tracks[b], # (T, N, 4) + vid[b], # (C, T, H, W) + temperature, + vae_divide, + topk + ) + out_masks.append(mask) + out_features.append(feature) + + # Stack results: (B, C, T, H, W) + out_mask_full = torch.stack(out_masks, dim=0) + out_feature_full = torch.stack(out_features, dim=0) + + return out_mask_full, out_feature_full + +class WanTrackToVideo: + @classmethod 
+ def INPUT_TYPES(s): + return {"required": { + "positive": ("CONDITIONING", ), + "negative": ("CONDITIONING", ), + "vae": ("VAE", ), + "tracks": ("STRING", {"multiline": True, "default": "[]"}), + "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), + "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), + "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), + "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), + "temperature": ("FLOAT", {"default": 220.0, "min": 1.0, "max": 1000.0, "step": 0.1}), + "topk": ("INT", {"default": 2, "min": 1, "max": 10}), + "start_image": ("IMAGE", ), + }, + "optional": { + "clip_vision_output": ("CLIP_VISION_OUTPUT", ), + }} + + RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") + RETURN_NAMES = ("positive", "negative", "latent") + FUNCTION = "encode" + + CATEGORY = "conditioning/video_models" + + def encode(self, positive, negative, vae, tracks, width, height, length, batch_size, + temperature, topk, start_image=None, clip_vision_output=None): + + tracks_data = parse_json_tracks(tracks) + + if not tracks_data: + return WanImageToVideo().encode(positive, negative, vae, width, height, length, batch_size, start_image=start_image, clip_vision_output=clip_vision_output) + + latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], + device=comfy.model_management.intermediate_device()) + + if isinstance(tracks_data[0][0], dict): + tracks_data = [tracks_data] + + processed_tracks = [] + for batch in tracks_data: + arrs = [] + for track in batch: + pts = pad_pts(track) + arrs.append(pts) + + tracks_np = np.stack(arrs, axis=0) + processed_tracks.append(process_tracks(tracks_np, (width, height), length - 1).unsqueeze(0)) + + if start_image is not None: + start_image = comfy.utils.common_upscale(start_image[:batch_size].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + videos = torch.ones((start_image.shape[0], length, height, width, start_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) * 0.5 + for i in range(start_image.shape[0]): + videos[i, 0] = start_image[i] + + latent_videos = [] + videos = comfy.utils.resize_to_batch_size(videos, batch_size) + for i in range(batch_size): + latent_videos += [vae.encode(videos[i, :, :, :, :3])] + y = torch.cat(latent_videos, dim=0) + + # Scale latent since patch_motion is non-linear + y = comfy.latent_formats.Wan21().process_in(y) + + processed_tracks = comfy.utils.resize_list_to_batch_size(processed_tracks, batch_size) + res = patch_motion( + processed_tracks, y, temperature=temperature, topk=topk, vae_divide=(4, 16) + ) + + mask, concat_latent_image = res + concat_latent_image = comfy.latent_formats.Wan21().process_out(concat_latent_image) + mask = -mask + 1.0 # Invert mask to match expected format + positive = node_helpers.conditioning_set_values(positive, + {"concat_mask": mask, + "concat_latent_image": concat_latent_image}) + negative = node_helpers.conditioning_set_values(negative, + {"concat_mask": mask, + "concat_latent_image": concat_latent_image}) + + if clip_vision_output is not None: + positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output}) + negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output}) + + out_latent = {} + out_latent["samples"] = latent + return (positive, negative, out_latent) + NODE_CLASS_MAPPINGS = { + "WanTrackToVideo": 
WanTrackToVideo, "WanImageToVideo": WanImageToVideo, "WanFunControlToVideo": WanFunControlToVideo, "WanFunInpaintToVideo": WanFunInpaintToVideo, From e6e5d33b351fc5ed8334d74dac77b283ecea8708 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 25 Jul 2025 01:58:28 -0700 Subject: [PATCH 23/24] Remove useless code. (#9041) This is only needed on old pytorch 2.0 and older. --- comfy/ldm/wan/vae.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/comfy/ldm/wan/vae.py b/comfy/ldm/wan/vae.py index a8ebc5ec6..a83c6edfd 100644 --- a/comfy/ldm/wan/vae.py +++ b/comfy/ldm/wan/vae.py @@ -52,15 +52,6 @@ class RMS_norm(nn.Module): x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma.to(x) + (self.bias.to(x) if self.bias is not None else 0) -class Upsample(nn.Upsample): - - def forward(self, x): - """ - Fix bfloat16 support for nearest neighbor interpolation. - """ - return super().forward(x.float()).type_as(x) - - class Resample(nn.Module): def __init__(self, dim, mode): @@ -73,11 +64,11 @@ class Resample(nn.Module): # layers if mode == 'upsample2d': self.resample = nn.Sequential( - Upsample(scale_factor=(2., 2.), mode='nearest-exact'), + nn.Upsample(scale_factor=(2., 2.), mode='nearest-exact'), ops.Conv2d(dim, dim // 2, 3, padding=1)) elif mode == 'upsample3d': self.resample = nn.Sequential( - Upsample(scale_factor=(2., 2.), mode='nearest-exact'), + nn.Upsample(scale_factor=(2., 2.), mode='nearest-exact'), ops.Conv2d(dim, dim // 2, 3, padding=1)) self.time_conv = CausalConv3d( dim, dim * 2, (3, 1, 1), padding=(1, 0, 0)) From 93bc2f8e4d5dace2328b861579df24f91684e27e Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sat, 26 Jul 2025 01:24:23 +0800 Subject: [PATCH 24/24] Update template to 0.1.40 (#9048) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8f6a6d112..33a59b4be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.39 +comfyui-workflow-templates==0.1.40 comfyui-embedded-docs==0.2.4 torch torchsde
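Note on the "tracks" input introduced by the WanTrackToVideo node in [PATCH 22/24]: parse_json_tracks() accepts a JSON string (single quotes are tolerated and rewritten to double quotes) containing either one track or a list of tracks, where each track is a list of {"x": ..., "y": ...} points in pixel coordinates; a lone track is wrapped into a list automatically. pad_pts() then pads or truncates every track to 121 points and carries a visibility flag, and process_tracks() centers the coordinates, scales them by the short edge to roughly [-1, 1], and resamples the 121 steps to the requested frame count. The standalone sketch below is illustrative only (the coordinates are made up and it simply reuses the pad_pts helper from the patch); it shows the input format and the array shape the node works with:

    import json
    import numpy as np

    FIXED_LENGTH = 121  # trajectory length used by the patch

    def pad_pts(tr):
        """Convert a list of {x, y} points to a (121, 1, 3) array, padding or truncating."""
        pts = np.array([[p['x'], p['y'], 1] for p in tr], dtype=np.float32)  # 1 = visible
        n = pts.shape[0]
        if n < FIXED_LENGTH:
            pts = np.vstack((pts, np.zeros((FIXED_LENGTH - n, 3), dtype=np.float32)))
        else:
            pts = pts[:FIXED_LENGTH]
        return pts.reshape(FIXED_LENGTH, 1, 3)

    # One trajectory: x/y pixel positions over time (sample values, for illustration only).
    tracks_json = '[{"x": 100, "y": 200}, {"x": 110, "y": 205}, {"x": 120, "y": 210}]'
    track = json.loads(tracks_json)

    pts = pad_pts(track)
    print(pts.shape)  # (121, 1, 3): 121 timesteps, 1 tracked point, (x, y, visibility)

Passing the node's default value "[]" yields no parsed tracks, in which case encode() falls back to the plain WanImageToVideo path, as shown in the patch above.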