From 203849f6cce1cc8b246c051096c49c2a8e0e411c Mon Sep 17 00:00:00 2001
From: Yousef Rafat <81116377+yousef-rafat@users.noreply.github.com>
Date: Fri, 24 Oct 2025 18:09:30 +0300
Subject: [PATCH] Remove CUDA graph capture; use torch-only FIR filtering in loudness.py

---
 comfy/autoregressive_sampling.py       | 47 +-------------------
 comfy/ldm/higgsv2/cuda_graph_runner.py | 59 --------------------------
 comfy/ldm/higgsv2/loudness.py          | 52 ++++++++---------------
 3 files changed, 19 insertions(+), 139 deletions(-)
 delete mode 100644 comfy/ldm/higgsv2/cuda_graph_runner.py

diff --git a/comfy/autoregressive_sampling.py b/comfy/autoregressive_sampling.py
index cde09a9f0..8cecd7428 100644
--- a/comfy/autoregressive_sampling.py
+++ b/comfy/autoregressive_sampling.py
@@ -9,40 +9,10 @@ from enum import Enum
 from dataclasses import dataclass, fields
 from transformers.cache_utils import StaticCache, DynamicCache, Cache
 from typing import Optional, Union, Any
-from comfy.model_management import get_free_memory, minimum_inference_memory
 import comfy.model_management
 
 NEED_SETUP_CACHE_CLASSES_MAPPING = { "static": StaticCache }
 
-def estimate_autoregressive_vram(
-    num_layers: int,
-    hidden_dim: int,
-    max_seq_len: int,
-    batch_size: int = 1,
-    dtype = torch.float16,
-    intermediate_factor: float = 4.0,
-    device = torch.device('cuda')
-) -> bool:
-
-    dtype_size = torch.finfo(dtype).bits // 8
-    kv_cache_bytes = num_layers * max_seq_len * hidden_dim * 2 * batch_size * dtype_size
-
-    # we only calculate hidden states in cuda graphs, so we don't care about the output logits
-    input_bytes = output_bytes = batch_size * max_seq_len * hidden_dim * dtype_size
-
-    # rough calculation for activation sizes
-    intermediate_bytes = intermediate_factor * output_bytes
-
-    total_estimated = kv_cache_bytes + input_bytes + output_bytes + intermediate_bytes
-
-    # get vram info
-    free_vram = get_free_memory(device)
-    minimum_vram = minimum_inference_memory()
-
-    enough_vram = free_vram - minimum_vram >= total_estimated
-
-    return enough_vram
-
 class TopKLogits:
     def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
 
@@ -282,21 +252,8 @@ class AutoRegressiveGeneration:
 
         } if self.model.cache_implementation == "static" and self.model.use_kv_buckets else None
 
-        enough_vram = estimate_autoregressive_vram(
-            self.model.num_hidden_layers, self.model.hidden_dim, self.model.max_seq_len, dtype = self.dtype, device = device
-        )
-
-        # cuda graphs only help if input shapes are constant
-        if (
-            device == "cuda"
-            and hasattr(model, "capture_model")
-            and self.model.cache_implementation == "static"
-            and self.model.use_kv_buckets
-            and enough_vram
-        ):
-            self.model.capture_model(self.kv_caches.values())
-        else:
-            self.model.generation_config.is_using_cuda_graphs = False
+        # CUDA graph capture is disabled for now, so always mark it as unused.
+        self.model.generation_config.is_using_cuda_graphs = False
 
     @torch.inference_mode()
     def generate(self, input_ids: Optional[torch.LongTensor] = None, max_new_length: int = 1024, min_new_length = 0,
diff --git a/comfy/ldm/higgsv2/cuda_graph_runner.py b/comfy/ldm/higgsv2/cuda_graph_runner.py
deleted file mode 100644
index e86f034c7..000000000
--- a/comfy/ldm/higgsv2/cuda_graph_runner.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import torch
-import torch.nn as nn
-from typing import Optional, Dict
-import gc
-
-_NUM_WARMUP_ITERS = 2
-
-class CUDAGraphRunner(nn.Module):
-    def __init__(self, model):
-        super().__init__()
-        self.model = model
-
-        self.input_buffers: Dict[str, torch.Tensor] = {}
-        self.output_buffers: Dict[str, torch.Tensor] = {}
-
-        self._graph: Optional[torch.cuda.CUDAGraph] = None
-
-    @property
-    def graph(self):
-        assert self._graph is not None
-        return self._graph
-
-    def capture(self, *args, **kwargs):
-        assert self._graph is None
-
-        for _ in range(_NUM_WARMUP_ITERS):
-            self.model(*args, **kwargs)
-
-        torch.cuda.synchronize()
-
-        self._graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(self._graph, pool = kwargs.get("memory_pool", None), stream = kwargs.get("stream", None)):
-            last_hidden_states = self.model(*args, **kwargs)
-            gc.collect()
-
-        torch.cuda.synchronize()
-
-        self.input_buffers = {
-            "args": [arg for arg in args if isinstance(arg, torch.Tensor)],
-            "kwargs": {k: v for k, v in kwargs.items() if isinstance(v, torch.Tensor)},
-        }
-
-        self.output_buffers = {
-            "hidden_states": last_hidden_states
-        }
-
-    def forward(self, *args, **kwargs):
-
-        for i, arg in enumerate(args):
-            if isinstance(arg, torch.Tensor):
-                self.input_buffers["args"][i].copy_(arg, non_blocking=True)
-
-        for k, v in kwargs.items():
-            if k in self.input_buffers["kwargs"] and isinstance(v, torch.Tensor):
-                self.input_buffers["kwargs"][k].copy_(v, non_blocking=True)
-
-        self.graph.replay()
-
-        return self.output_buffers["hidden_states"]
diff --git a/comfy/ldm/higgsv2/loudness.py b/comfy/ldm/higgsv2/loudness.py
index ac1850c61..5c5850c47 100644
--- a/comfy/ldm/higgsv2/loudness.py
+++ b/comfy/ldm/higgsv2/loudness.py
@@ -88,6 +88,7 @@ def fft_conv1d(
 class IIRfilter(object):
 
     def __init__(self, G, Q, fc, rate, filter_type, passband_gain=1.0):
+        G, Q, fc, rate = [t if isinstance(t, torch.Tensor) else torch.as_tensor(t) for t in (G, Q, fc, rate)]
         self.G  = G
         self.Q  = Q
         self.fc = fc
@@ -98,26 +99,26 @@ class IIRfilter(object):
     def generate_coefficients(self):
 
         A  = 10**(self.G/40.0)
-        w0 = 2.0 * np.pi * (self.fc / self.rate)
-        alpha = np.sin(w0) / (2.0 * self.Q)
+        w0 = 2.0 * torch.pi * (self.fc / self.rate)
+        alpha = torch.sin(w0) / (2.0 * self.Q)
 
         if self.filter_type == 'high_shelf':
-            b0 =      A * ( (A+1) + (A-1) * np.cos(w0) + 2 * np.sqrt(A) * alpha )
-            b1 = -2 * A * ( (A-1) + (A+1) * np.cos(w0)                          )
-            b2 =      A * ( (A+1) + (A-1) * np.cos(w0) - 2 * np.sqrt(A) * alpha )
-            a0 =            (A+1) - (A-1) * np.cos(w0) + 2 * np.sqrt(A) * alpha
-            a1 =      2 * ( (A-1) - (A+1) * np.cos(w0)                          )
-            a2 =            (A+1) - (A-1) * np.cos(w0) - 2 * np.sqrt(A) * alpha
+            b0 =      A * ( (A+1) + (A-1) * torch.cos(w0) + 2 * torch.sqrt(A) * alpha )
+            b1 = -2 * A * ( (A-1) + (A+1) * torch.cos(w0)                          )
+            b2 =      A * ( (A+1) + (A-1) * torch.cos(w0) - 2 * torch.sqrt(A) * alpha )
+            a0 =            (A+1) - (A-1) * torch.cos(w0) + 2 * torch.sqrt(A) * alpha
+            a1 =      2 * ( (A-1) - (A+1) * torch.cos(w0)                          )
+            a2 =            (A+1) - (A-1) * torch.cos(w0) - 2 * torch.sqrt(A) * alpha
 
         elif self.filter_type == 'high_pass':
-            b0 =  (1 + np.cos(w0))/2
-            b1 = -(1 + np.cos(w0))
-            b2 =  (1 + np.cos(w0))/2
+            b0 =  (1 + torch.cos(w0))/2
+            b1 = -(1 + torch.cos(w0))
+            b2 =  (1 + torch.cos(w0))/2
             a0 =   1 + alpha
-            a1 =  -2 * np.cos(w0)
+            a1 =  -2 * torch.cos(w0)
             a2 =   1 - alpha
 
-        return np.array([b0, b1, b2])/a0, np.array([a0, a1, a2])/a0
+        return torch.tensor([b0, b1, b2])/a0, torch.tensor([a0, a1, a2])/a0
 
     def apply_filter(self, data):
         return self.passband_gain * scipy.signal.lfilter(self.b, self.a, data)
@@ -160,14 +161,14 @@ class Meter(torch.nn.Module):
 
         for i, (_, filter_stage) in enumerate(self._filters.items()):
             b, a = filter_stage.b_and_a
-            firs[i] = scipy.signal.lfilter(b, a, impulse)
+            firs[i] = scipy.signal.lfilter(b.numpy(), a.numpy(), impulse)
 
         firs = torch.from_numpy(firs[..., ::-1].copy()).float()
 
         self.register_buffer("firs", firs)
         self.register_buffer("passband_gain", passband_gain)
 
-    def apply_filter_gpu(self, data: torch.Tensor):
+    def apply_filter_fir(self, data: torch.Tensor):
 
         # Data is of shape (nb, nch, nt)
         # Reshape to (nb*nch, 1, nt)
@@ -189,27 +190,8 @@ class Meter(torch.nn.Module):
         data = data[:, :nt, :]
         return data
 
-    def apply_filter_cpu(self, data: torch.Tensor):
-        for _, filter_stage in self._filters.items():
-            passband_gain = filter_stage.passband_gain
-            b, a = filter_stage.b_and_a
-
-            a_coeffs = torch.from_numpy(a).float().to(data.device)
-            b_coeffs = torch.from_numpy(b).float().to(data.device)
-
-            _data = data.permute(0, 2, 1)
-            filtered = torchaudio.functional.lfilter(
-                _data, a_coeffs, b_coeffs, clamp=False
-            )
-            data = passband_gain * filtered.permute(0, 2, 1)
-        return data
-
     def apply_filter(self, data: torch.Tensor):
-        if data.is_cuda or self.use_fir:
-            data = self.apply_filter_gpu(data)
-        else:
-            data = self.apply_filter_cpu(data)
-        return data
+        return self.apply_filter_fir(data)
 
     def forward(self, data: torch.Tensor):
         return self.integrated_loudness(data)