From 203849f6cce1cc8b246c051096c49c2a8e0e411c Mon Sep 17 00:00:00 2001 From: Yousef Rafat <81116377+yousef-rafat@users.noreply.github.com> Date: Fri, 24 Oct 2025 18:09:30 +0300 Subject: [PATCH] removed cuda graphs + changes to loudness.py --- comfy/autoregressive_sampling.py | 47 +------------------- comfy/ldm/higgsv2/cuda_graph_runner.py | 59 -------------------------- comfy/ldm/higgsv2/loudness.py | 52 ++++++++--------------- 3 files changed, 19 insertions(+), 139 deletions(-) delete mode 100644 comfy/ldm/higgsv2/cuda_graph_runner.py diff --git a/comfy/autoregressive_sampling.py b/comfy/autoregressive_sampling.py index cde09a9f0..8cecd7428 100644 --- a/comfy/autoregressive_sampling.py +++ b/comfy/autoregressive_sampling.py @@ -9,40 +9,10 @@ from enum import Enum from dataclasses import dataclass, fields from transformers.cache_utils import StaticCache, DynamicCache, Cache from typing import Optional, Union, Any -from comfy.model_management import get_free_memory, minimum_inference_memory import comfy.model_management NEED_SETUP_CACHE_CLASSES_MAPPING = { "static": StaticCache } -def estimate_autoregressive_vram( - num_layers: int, - hidden_dim: int, - max_seq_len: int, - batch_size: int = 1, - dtype = torch.float16, - intermediate_factor: float = 4.0, - device = torch.device('cuda') -) -> bool: - - dtype_size = torch.finfo(dtype).bits // 8 - kv_cache_bytes = num_layers * max_seq_len * hidden_dim * 2 * batch_size * dtype_size - - # we only calculate hidden states in cuda graphs, so we don't care about the output logits - input_bytes = output_bytes = batch_size * max_seq_len * hidden_dim * dtype_size - - # rough calculation for activation sizes - intermediate_bytes = intermediate_factor * output_bytes - - total_estimated = kv_cache_bytes + input_bytes + output_bytes + intermediate_bytes - - # get vram info - free_vram = get_free_memory(device) - minimum_vram = minimum_inference_memory() - - enough_vram = free_vram - minimum_vram >= total_estimated - - return enough_vram - class TopKLogits: def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): @@ -282,21 +252,8 @@ class AutoRegressiveGeneration: } if self.model.cache_implementation == "static" and self.model.use_kv_buckets else None - enough_vram = estimate_autoregressive_vram( - self.model.num_hidden_layers, self.model.hidden_dim, self.model.max_seq_len, dtype = self.dtype, device = device - ) - - # cuda graphs only help if input shapes are constant - if ( - device == "cuda" - and hasattr(model, "capture_model") - and self.model.cache_implementation == "static" - and self.model.use_kv_buckets - and enough_vram - ): - self.model.capture_model(self.kv_caches.values()) - else: - self.model.generation_config.is_using_cuda_graphs = False + # for now + self.model.generation_config.is_using_cuda_graphs = False @torch.inference_mode() def generate(self, input_ids: Optional[torch.LongTensor] = None, max_new_length: int = 1024, min_new_length = 0, diff --git a/comfy/ldm/higgsv2/cuda_graph_runner.py b/comfy/ldm/higgsv2/cuda_graph_runner.py deleted file mode 100644 index e86f034c7..000000000 --- a/comfy/ldm/higgsv2/cuda_graph_runner.py +++ /dev/null @@ -1,59 +0,0 @@ -import torch -import torch.nn as nn -from typing import Optional, Dict -import gc - -_NUM_WARMUP_ITERS = 2 - -class CUDAGraphRunner(nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - self.input_buffers: Dict[str, torch.Tensor] = {} - self.output_buffers: Dict[str, torch.Tensor] = {} - - self._graph: Optional[torch.cuda.CUDAGraph] = None - - @property - def graph(self): - assert self._graph is not None - return self._graph - - def capture(self, *args, **kwargs): - assert self._graph is None - - for _ in range(_NUM_WARMUP_ITERS): - self.model(*args, **kwargs) - - torch.cuda.synchronize() - - self._graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(self._graph, pool = kwargs.get("memory_pool", None), stream = kwargs.get("stream", None)): - last_hidden_states = self.model(*args, **kwargs) - gc.collect() - - torch.cuda.synchronize() - - self.input_buffers = { - "args": [arg for arg in args if isinstance(arg, torch.Tensor)], - "kwargs": {k: v for k, v in kwargs.items() if isinstance(v, torch.Tensor)}, - } - - self.output_buffers = { - "hidden_states": last_hidden_states - } - - def forward(self, *args, **kwargs): - - for i, arg in enumerate(args): - if isinstance(arg, torch.Tensor): - self.input_buffers["args"][i].copy_(arg, non_blocking=True) - - for k, v in kwargs.items(): - if k in self.input_buffers["kwargs"] and isinstance(v, torch.Tensor): - self.input_buffers["kwargs"][k].copy_(v, non_blocking=True) - - self.graph.replay() - - return self.output_buffers["hidden_states"] diff --git a/comfy/ldm/higgsv2/loudness.py b/comfy/ldm/higgsv2/loudness.py index ac1850c61..5c5850c47 100644 --- a/comfy/ldm/higgsv2/loudness.py +++ b/comfy/ldm/higgsv2/loudness.py @@ -88,6 +88,7 @@ def fft_conv1d( class IIRfilter(object): def __init__(self, G, Q, fc, rate, filter_type, passband_gain=1.0): + G, Q, fc, rate = [t if isinstance(t, torch.Tensor) else torch.as_tensor(t) for t in (G, Q, fc, rate)] self.G = G self.Q = Q self.fc = fc @@ -98,26 +99,26 @@ class IIRfilter(object): def generate_coefficients(self): A = 10**(self.G/40.0) - w0 = 2.0 * np.pi * (self.fc / self.rate) - alpha = np.sin(w0) / (2.0 * self.Q) + w0 = 2.0 * torch.pi * (self.fc / self.rate) + alpha = torch.sin(w0) / (2.0 * self.Q) if self.filter_type == 'high_shelf': - b0 = A * ( (A+1) + (A-1) * np.cos(w0) + 2 * np.sqrt(A) * alpha ) - b1 = -2 * A * ( (A-1) + (A+1) * np.cos(w0) ) - b2 = A * ( (A+1) + (A-1) * np.cos(w0) - 2 * np.sqrt(A) * alpha ) - a0 = (A+1) - (A-1) * np.cos(w0) + 2 * np.sqrt(A) * alpha - a1 = 2 * ( (A-1) - (A+1) * np.cos(w0) ) - a2 = (A+1) - (A-1) * np.cos(w0) - 2 * np.sqrt(A) * alpha + b0 = A * ( (A+1) + (A-1) * torch.cos(w0) + 2 * torch.sqrt(A) * alpha ) + b1 = -2 * A * ( (A-1) + (A+1) * torch.cos(w0) ) + b2 = A * ( (A+1) + (A-1) * torch.cos(w0) - 2 * torch.sqrt(A) * alpha ) + a0 = (A+1) - (A-1) * torch.cos(w0) + 2 * torch.sqrt(A) * alpha + a1 = 2 * ( (A-1) - (A+1) * torch.cos(w0) ) + a2 = (A+1) - (A-1) * torch.cos(w0) - 2 * torch.sqrt(A) * alpha elif self.filter_type == 'high_pass': - b0 = (1 + np.cos(w0))/2 - b1 = -(1 + np.cos(w0)) - b2 = (1 + np.cos(w0))/2 + b0 = (1 + torch.cos(w0))/2 + b1 = -(1 + torch.cos(w0)) + b2 = (1 + torch.cos(w0))/2 a0 = 1 + alpha - a1 = -2 * np.cos(w0) + a1 = -2 * torch.cos(w0) a2 = 1 - alpha - return np.array([b0, b1, b2])/a0, np.array([a0, a1, a2])/a0 + return torch.tensor([b0, b1, b2])/a0, torch.tensor([a0, a1, a2])/a0 def apply_filter(self, data): return self.passband_gain * scipy.signal.lfilter(self.b, self.a, data) @@ -160,14 +161,14 @@ class Meter(torch.nn.Module): for i, (_, filter_stage) in enumerate(self._filters.items()): b, a = filter_stage.b_and_a - firs[i] = scipy.signal.lfilter(b, a, impulse) + firs[i] = scipy.signal.lfilter(b.numpy(), a.numpy(), impulse) firs = torch.from_numpy(firs[..., ::-1].copy()).float() self.register_buffer("firs", firs) self.register_buffer("passband_gain", passband_gain) - def apply_filter_gpu(self, data: torch.Tensor): + def apply_filter_fir(self, data: torch.Tensor): # Data is of shape (nb, nch, nt) # Reshape to (nb*nch, 1, nt) @@ -189,27 +190,8 @@ class Meter(torch.nn.Module): data = data[:, :nt, :] return data - def apply_filter_cpu(self, data: torch.Tensor): - for _, filter_stage in self._filters.items(): - passband_gain = filter_stage.passband_gain - b, a = filter_stage.b_and_a - - a_coeffs = torch.from_numpy(a).float().to(data.device) - b_coeffs = torch.from_numpy(b).float().to(data.device) - - _data = data.permute(0, 2, 1) - filtered = torchaudio.functional.lfilter( - _data, a_coeffs, b_coeffs, clamp=False - ) - data = passband_gain * filtered.permute(0, 2, 1) - return data - def apply_filter(self, data: torch.Tensor): - if data.is_cuda or self.use_fir: - data = self.apply_filter_gpu(data) - else: - data = self.apply_filter_cpu(data) - return data + return self.apply_filter_fir(data) def forward(self, data: torch.Tensor): return self.integrated_loudness(data)