Yousef Rafat 2025-09-05 23:47:56 +03:00
parent 27870ec3c3
commit 254622d7c6
19 changed files with 1260506 additions and 9 deletions


@@ -0,0 +1,632 @@
from __future__ import annotations
import copy
import torch
import inspect
import warnings
from enum import Enum
from dataclasses import dataclass, fields
from transformers.cache_utils import StaticCache, DynamicCache, Cache
from typing import Optional, Union, Any
from comfy.model_management import get_free_memory, minimum_inference_memory
import comfy.model_management
import comfy.utils
NEED_SETUP_CACHE_CLASSES_MAPPING = { "static": StaticCache }
def estimate_autoregressive_vram(
num_layers: int,
hidden_dim: int,
max_seq_len: int,
batch_size: int = 1,
dtype = torch.float16,
intermediate_factor: float = 4.0,
device = torch.device('cuda')
) -> bool:
dtype_size = torch.finfo(dtype).bits // 8
kv_cache_bytes = num_layers * max_seq_len * hidden_dim * 2 * batch_size * dtype_size
# we only calculate hidden states in cuda graphs, so we don't care about the output logits
input_bytes = output_bytes = batch_size * max_seq_len * hidden_dim * dtype_size
# rough calculation for activation sizes
intermediate_bytes = intermediate_factor * output_bytes
total_estimated = kv_cache_bytes + input_bytes + output_bytes + intermediate_bytes
# get vram info
free_vram = get_free_memory(device)
minimum_vram = minimum_inference_memory()
enough_vram = free_vram - minimum_vram >= total_estimated
return enough_vram
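# A hand-worked example of the estimate above (hypothetical model sizes, not taken from any
# particular checkpoint): with num_layers=32, hidden_dim=4096, max_seq_len=8192, batch_size=1
# and float16 (2 bytes), the KV cache term is 32 * 8192 * 4096 * 2 * 1 * 2 B = 4 GiB, the
# hidden-state input/output terms are 8192 * 4096 * 2 B = 64 MiB each, and the activation
# term is 4 * 64 MiB = 256 MiB, so generation is only allowed if roughly 4.4 GiB are free
# on top of comfy's minimum_inference_memory() reserve.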
class TopKLogits:
def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
self.top_k = max(top_k, min_tokens_to_keep)
self.filter_value = filter_value
def __call__(self, scores: torch.FloatTensor) -> torch.FloatTensor:
top_k = min(self.top_k, scores.size(-1)) # cap top_k with vocab_size
indices_to_remove = scores < torch.topk(scores, top_k)[0][..., -1, None]
scores_processed = scores.masked_fill(indices_to_remove, self.filter_value)
return scores_processed
class TemperatureLogitsWarper:
def __init__(self, temperature: float):
if not (temperature > 0):
raise ValueError(f"`temperature` (={temperature}) must be positive temperature > 0")
self.temperature = temperature
def __call__(self, scores: torch.FloatTensor) -> torch.FloatTensor:
scores_processed = scores / self.temperature
return scores_processed
class TopPLogitsWarper:
def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
top_p = float(top_p)
if top_p < 0 or top_p > 1.0:
raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
self.top_p = top_p
self.filter_value = filter_value
self.min_tokens_to_keep = min_tokens_to_keep
def __call__(self, scores: torch.FloatTensor) -> torch.FloatTensor:
sorted_logits, sorted_indices = torch.sort(scores, descending=False)
cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
sorted_indices_to_remove = cumulative_probs <= (1 - self.top_p)
sorted_indices_to_remove[..., -self.min_tokens_to_keep :] = 0
indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
scores_processed = scores.masked_fill(indices_to_remove, self.filter_value)
return scores_processed
def get_logits_processing(config: GenerationConfig):
# TODO: add support for beam search with diversity penalty
logits_processors = []
if config.top_k is not None and config.top_k != 0:
logits_processors.append(TopKLogits(config.top_k))
if config.temperature is not None and config.temperature != 1:
logits_processors.append(TemperatureLogitsWarper(config.temperature))
if config.top_p is not None and config.top_p != 1:
logits_processors.append(TopPLogitsWarper(config.top_p))
return logits_processors
def apply_logits_processing(logits, logits_processing_list, **kwargs):
for process in logits_processing_list:
func_args = inspect.signature(process.__call__).parameters
if not all(arg in kwargs for arg in list(func_args.keys())[1:]):
raise ValueError(
f"Make sure that all the required parameters: {list(func_args.keys())} for "
f"{process.__class__} are passed to the logits processor."
)
logits = process(logits, **kwargs)
return logits
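# Hand-checked illustration of the pipeline above (toy numbers): for
# scores = torch.tensor([[1.0, 2.0, 3.0, 4.0]]) and a GenerationConfig with
# top_k=2, temperature=0.5, top_p=0.9, get_logits_processing() returns
# [TopKLogits, TemperatureLogitsWarper, TopPLogitsWarper]; top-k masks the two
# smallest logits to -inf, the temperature warper rescales the rest to [6.0, 8.0],
# and top-p(0.9) leaves both surviving tokens in place, so
# apply_logits_processing() yields [[-inf, -inf, 6.0, 8.0]].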
def check_stopping_criteria(input_ids: torch.Tensor, max_length: int, eos_token):
if not isinstance(eos_token, torch.Tensor):
eos_token = torch.tensor(eos_token, device=input_ids.device)
max_len_done = input_ids.shape[1] >= max_length
eos_done = torch.isin(input_ids[:, -1], eos_token)
    # finished either by length or eos
finished_mask = max_len_done | eos_done
unfinished_mask = ~finished_mask
return finished_mask, unfinished_mask
class GenerationSampling(Enum):
GREEDY_SEARCH = "greedy_search"
BEAM_SAMPLING = "beam_sample"
@dataclass
class GenerationConfig:
pad_token_id: int = None
bos_token_id: int = None
eos_token_id: int = None
top_k: int = None
    top_p: float = None
do_sample: bool = None # TODO: add beam sampling logic
use_cache: bool = True
num_beams: int = 1
temperature: float = 1.0
max_new_length: int = 1024
min_new_length: int = 0
num_return_sequences: int = 1
generation_kwargs = {}
cache_implementation: str = "static"
is_using_cuda_graphs: bool = True
    # internal values, set during generation only (not meant to be configured directly)
max_length = None
min_length = None
_bos_token_tensor = None
_eos_token_tensor = None
_pad_token_tensor = None
def update(self, **kwargs):
to_remove = []
for key, value in kwargs.items():
if hasattr(self, key):
setattr(self, key, value)
to_remove.append(key)
# TODO in case of scaling for beam sampling: add a validation for the user's config
unused_kwargs = {key: value for key, value in kwargs.items() if key not in to_remove}
return unused_kwargs
@classmethod
def from_model_config(cls, config_dict: dict, **kwargs) -> GenerationConfig:
config_dict = {key: value for key, value in config_dict.items() if value is not None}
valid_fields = {f.name for f in fields(cls)}
filtered_args = {k: v for k, v in {**config_dict, **kwargs}.items() if k in valid_fields}
generation_config = cls(**filtered_args)
return generation_config
@dataclass
class CacheConfig:
hidden_size: int
num_attention_heads: int
num_key_value_heads: int
head_dim: int
intermediate_size: int
num_hidden_layers: int
max_batch: int = 1
def get_text_config(self):
return self
class AutoRegressiveGeneration:
def __init__(self, model, device, kv_cache_lengths: list = [1024, 4096, 8192]):
self.device = device
self.dtype = model.dtype
self.model = model
text_config = self.model.cache_config
self.cache_config = CacheConfig(
hidden_size = text_config["hidden_size"],
num_attention_heads = text_config["num_attention_heads"],
num_key_value_heads = text_config["num_key_value_heads"],
head_dim = text_config["head_dim"],
intermediate_size = text_config["intermediate_size"],
num_hidden_layers = self.model.num_hidden_layers or text_config["num_hidden_layers"],
)
self.model.cache_config = self.cache_config
self.kv_caches = {
length: StaticCache(
config=self.cache_config,
max_batch_size = self.cache_config.max_batch,
max_cache_len=length,
device=self.model.device,
dtype=self.model.dtype,
)
for length in sorted(kv_cache_lengths)
} if self.model.cache_implementation == "static" and self.model.use_kv_buckets else None
enough_vram = estimate_autoregressive_vram(
self.model.num_hidden_layers, self.model.hidden_dim, self.model.max_seq_len, dtype = self.dtype, device = device
)
# cuda graphs only help if input shapes are constant
if (
device == "cuda"
and hasattr(model, "capture_model")
and self.model.cache_implementation == "static"
and self.model.use_kv_buckets
and enough_vram
):
self.model.capture_model(self.kv_caches.values())
else:
self.model.generation_config.is_using_cuda_graphs = False
@torch.inference_mode()
def generate(self, input_ids: Optional[torch.LongTensor] = None, max_new_length: int = 1024, min_new_length = 0,
top_k: int = 50, top_p: float = 1.0, temperature: float = 1.0, do_sample: bool = False, seed = None, **kwargs):
if seed is not None:
torch_generator = torch.Generator(device = input_ids.device).manual_seed(seed)
else:
torch_generator = None
kwargs["torch_generator"] = torch_generator
kwargs["pbar"] = comfy.utils.ProgressBar(max_new_length)
gen_kwargs = dict(
max_new_length = max_new_length,
min_new_length = min_new_length,
top_k = top_k,
top_p = top_p,
temperature = temperature,
do_sample = do_sample
)
        # in case the specific model requires special handling
        generation_config = None
        generate_fn = getattr(self.model, "generate", None)
        if generate_fn is not None:
            generation_config = generate_fn(input_ids, generation_functions = self, **kwargs)
            if generation_config is not None:
                generation_config.update(**gen_kwargs)
        if generation_config is None:
            generation_config = GenerationConfig(max_new_length = max_new_length,
                                                 min_new_length = min_new_length,
                                                 top_k = top_k,
                                                 top_p = top_p,
                                                 do_sample = do_sample,
                                                 temperature = temperature)
        generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
accepts_attention_mask = "attention_mask" in set(inspect.signature(self.model.forward).parameters.keys())
requires_attention_mask = "encoder_outputs" not in model_kwargs
kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None
inputs_tensor = input_ids
model_input_name = "input_ids"
batch_size = inputs_tensor.shape[0]
device = inputs_tensor.device
self._prepare_special_tokens(generation_config, device = device)
if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask:
model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(
inputs_tensor, generation_config, model_kwargs
)
elif kwargs_has_attention_mask:
if model_input_name == "input_ids" and len(model_kwargs["attention_mask"].shape) > 2:
raise ValueError("`attention_mask` passed to `generate` must be 2D.")
input_ids_length = input_ids.shape[1]
generation_config = self._prepare_generated_length(
generation_config=generation_config,
input_ids_length=input_ids_length,
)
self._validate_generated_length(generation_config, input_ids_length)
max_cache_length = generation_config.max_length - 1
self._prepare_cache_for_generation(
generation_config, model_kwargs, batch_size, max_cache_length, device
)
generation_mode = self.get_generation_mode(generation_config)
prepared_logits_processor = get_logits_processing(generation_config)
model_kwargs["use_cache"] = generation_config.use_cache
if generation_mode == GenerationSampling.GREEDY_SEARCH:
input_ids, model_kwargs = self._expand_inputs_for_generation(
input_ids = input_ids,
expand_size = generation_config.num_return_sequences,
**model_kwargs,
)
# TODO: have a default self._sample fn and a default check if the model supports autoregGen or not
if not hasattr(self.model, "_sample"):
raise ValueError("Model doesn't support AutoRegressive Generation!")
self._prepare_kv_caches()
result = self.model._sample(
input_ids,
logits_processing_list = prepared_logits_processor,
generation_config = generation_config,
past_key_values_buckets = self.kv_caches,
**model_kwargs,
)
return result
def _prepare_kv_caches(self):
for kv_cache in self.kv_caches.values():
kv_cache.reset()
def get_generation_mode(self, config: GenerationConfig):
if config.num_beams > 1:
return GenerationSampling.BEAM_SAMPLING
else:
return GenerationSampling.GREEDY_SEARCH
def _prepare_generated_length(
self,
generation_config: GenerationConfig,
input_ids_length,
):
""" max_length = user_input_id_tokens + generation_max_length """
if generation_config.max_new_length is not None:
generation_config.max_length = generation_config.max_new_length + input_ids_length
        # same for min_length
if generation_config.min_new_length is not None:
generation_config.min_length = generation_config.min_new_length + input_ids_length
return generation_config
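    # Example of the bookkeeping above: a 32-token prompt with max_new_length=1024 and
    # min_new_length=0 yields max_length=1056 and min_length=32 (both in total tokens).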
def _get_cache(
self, cache_implementation: str, batch_size: int, max_cache_len: int, device: torch.device, model_kwargs
) -> Cache:
assert cache_implementation == "static", f"Only 'static' cache is supported, got {cache_implementation}"
cache_cls: Cache = NEED_SETUP_CACHE_CLASSES_MAPPING[cache_implementation]
if hasattr(self.model, "_cache"):
cache_to_check = self.model._cache
else:
cache_to_check = None
need_new_cache = (
not hasattr(self.model, "_cache")
or (not isinstance(cache_to_check, cache_cls))
or (self.cache_config.max_batch != batch_size)
or (cache_to_check.max_cache_len < max_cache_len)
)
if need_new_cache:
cache_dtype = getattr(self.model.config, "_pre_quantization_dtype", self.dtype)
cache_kwargs = {
"config": self.cache_config,
"max_batch_size": batch_size,
"max_cache_len": max_cache_len,
"dtype": cache_dtype,
"device": device,
"layer_device_map": None,
}
self.model._cache = cache_cls(**cache_kwargs)
else:
self.model._cache.reset()
return self.model._cache
def _prepare_cache_for_generation(
self,
generation_config: GenerationConfig,
model_kwargs: dict,
batch_size: int,
max_cache_length: int,
device: torch.device,
    ) -> None:
cache_name = "past_key_values"
if generation_config.use_cache is False:
return
if generation_config.cache_implementation is not None:
if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING:
model_kwargs[cache_name] = self._get_cache(
cache_implementation=generation_config.cache_implementation,
batch_size=max(generation_config.num_beams, generation_config.num_return_sequences) * batch_size,
max_cache_len=max_cache_length,
device=device,
model_kwargs=model_kwargs,
)
elif generation_config.cache_implementation == "dynamic":
model_kwargs[cache_name] = DynamicCache()
def _prepare_generation_config(self, generation_config: GenerationConfig, **kwargs: dict) -> tuple[GenerationConfig, dict]:
generation_config = copy.deepcopy(generation_config)
modified_values = {}
global_default_generation_config = GenerationConfig()
model_generation_config = self.model.generation_config
for key, model_gen_config_value in model_generation_config.__dict__.items():
if key.startswith("_"):
continue
global_default_value = getattr(global_default_generation_config, key, None)
custom_gen_config_value = getattr(generation_config, key, None)
if (
custom_gen_config_value == global_default_value
and model_gen_config_value != global_default_value
):
modified_values[key] = model_gen_config_value
setattr(generation_config, key, model_gen_config_value)
        if generation_config.temperature == 0.0:
            # a temperature of 0 means greedy decoding; reset it so TemperatureLogitsWarper
            # is not constructed with an invalid value in get_logits_processing()
            generation_config.do_sample = False
            generation_config.temperature = 1.0
model_kwargs = generation_config.update(**kwargs)
return generation_config, model_kwargs
def _validate_generated_length(self, generation_config: GenerationConfig, input_ids_length):
"""Performs validation related to the resulting generated length"""
if input_ids_length >= generation_config.max_length:
input_ids_string = "input_ids"
raise ValueError(
f"Input length of {input_ids_string} is {input_ids_length}, but `max_length` is set to"
f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
" increasing `max_length` or, better yet, setting `max_new_tokens`."
)
min_length_error_suffix = (
" Generation will stop at the defined maximum length. You should decrease the minimum length and/or "
"increase the maximum length."
)
if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length:
warnings.warn(
f"Unfeasible length constraints: `min_length` ({generation_config.min_length}) is larger than"
f" the maximum possible length ({generation_config.max_length})." + min_length_error_suffix,
UserWarning,
)
if generation_config.min_new_length is not None:
min_length = generation_config.min_new_length + input_ids_length
if min_length > generation_config.max_length:
warnings.warn(
f"Unfeasible length constraints: `min_new_length` ({generation_config.min_new_length}), when "
f"added to the prompt length ({input_ids_length}), is larger than"
f" the maximum possible length ({generation_config.max_length})." + min_length_error_suffix,
UserWarning,
)
def _expand_inputs_for_generation(
self,
expand_size: int = 1,
input_ids: Optional[torch.LongTensor] = None,
**model_kwargs,
) -> tuple[torch.LongTensor, dict[str, Any]]:
""" Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...] """
if expand_size == 1:
return input_ids, model_kwargs
def _expand_dict_for_generation(dict_to_expand):
for key in dict_to_expand:
if (
key != "cache_position"
and dict_to_expand[key] is not None
and isinstance(dict_to_expand[key], torch.Tensor)
):
dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
return dict_to_expand
if input_ids is not None:
input_ids = input_ids.repeat_interleave(expand_size, dim=0)
model_kwargs = _expand_dict_for_generation(model_kwargs)
return input_ids, model_kwargs
def _prepare_special_tokens(
self,
generation_config: GenerationConfig,
device: Optional[Union[torch.device, str]] = None,
):
def _tensor_or_none(token, device=None):
if token is None:
return token
device = device if device is not None else self.device
if isinstance(token, torch.Tensor):
return token.to(device)
return torch.tensor(token, device=device, dtype=torch.long)
bos_token_tensor = _tensor_or_none(generation_config.bos_token_id, device=device)
eos_token_tensor = _tensor_or_none(generation_config.eos_token_id, device=device)
pad_token_tensor = _tensor_or_none(generation_config.pad_token_id, device=device)
# We can have more than one eos token. Always treat it as a 1D tensor (when it exists).
if eos_token_tensor is not None and eos_token_tensor.ndim == 0:
eos_token_tensor = eos_token_tensor.unsqueeze(0)
if pad_token_tensor is None and eos_token_tensor is not None:
pad_token_tensor = eos_token_tensor[0]
# to avoid problems with cuda graphs
generation_config._bos_token_tensor = bos_token_tensor
generation_config._eos_token_tensor = eos_token_tensor
generation_config._pad_token_tensor = pad_token_tensor
def _prepare_attention_mask_for_generation(
self,
inputs_tensor: torch.Tensor,
generation_config: GenerationConfig,
model_kwargs: dict[str, Any],
) -> torch.LongTensor:
pad_token_id = generation_config._pad_token_tensor
eos_token_id = generation_config._eos_token_tensor
if "input_ids" in model_kwargs and model_kwargs["input_ids"].shape[1] > 0:
inputs_tensor = model_kwargs["input_ids"]
# No information for attention mask inference -> return default attention mask
default_attention_mask = torch.ones(inputs_tensor.shape[:2], dtype=torch.long, device=inputs_tensor.device)
if pad_token_id is None:
return default_attention_mask
is_input_ids = len(inputs_tensor.shape) == 2 and inputs_tensor.dtype in [torch.int, torch.long]
if not is_input_ids:
return default_attention_mask
is_pad_token_in_inputs = (pad_token_id is not None) and (
torch.isin(elements=inputs_tensor, test_elements=pad_token_id).any()
)
is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~(
torch.isin(elements=eos_token_id, test_elements=pad_token_id).any()
)
can_infer_attention_mask = is_pad_token_in_inputs * is_pad_token_not_equal_to_eos_token_id
attention_mask_from_padding = inputs_tensor.ne(pad_token_id).long()
attention_mask = (
attention_mask_from_padding * can_infer_attention_mask + default_attention_mask * ~can_infer_attention_mask
)
return attention_mask
def auto_sample(node, patcher, input_ids, max_new_length=1024, min_new_length=0, top_k=50, top_p=1.0, temperature=1.0, do_sample = False, seed=None, **kwargs):
# to work with BaseModel
if hasattr(patcher, "model") and hasattr(patcher.model, "diffusion_model"):
model = patcher.model.diffusion_model
if node._cached_autoregressive_sampler is None or node._cached_autoregressive_sampler.model is not model:
if model.device != patcher.load_device:
model = model.to(patcher.load_device, dtype=model.dtype)
model.device = patcher.load_device
node._cached_autoregressive_sampler = AutoRegressiveGeneration(model, device = patcher.load_device)
if isinstance(input_ids, dict):
main_input_ids = input_ids.get("input_ids")
kwargs.update({k: v for k, v in input_ids.items() if k != "input_ids"})
else:
main_input_ids = input_ids
    device = node._cached_autoregressive_sampler.device
    # for tokenizers.Encoding / BatchEncoding style outputs: convert to tensors before moving to device
    if not isinstance(main_input_ids, torch.Tensor):
        kwargs["attention_mask"] = torch.tensor(main_input_ids["attention_mask"])
        main_input_ids = torch.tensor(main_input_ids["input_ids"])
    main_input_ids = main_input_ids.to(device)
    for k, v in kwargs.items():
        if isinstance(v, torch.Tensor):
            kwargs[k] = v.to(device)
samples = node._cached_autoregressive_sampler.generate(main_input_ids, max_new_length, min_new_length, top_k, top_p, temperature, do_sample, seed=seed, **kwargs)
if isinstance(samples, list):
samples = [sample.to(comfy.model_management.intermediate_device()) for sample in samples]
else:
samples = samples.to(comfy.model_management.intermediate_device())
return samples
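# Sketch of the duck-typed contract this wrapper assumes (collected from the code above,
# not an official interface): the wrapped `model` must expose
#   - cache_config: dict with hidden_size, num_attention_heads, num_key_value_heads,
#     head_dim, intermediate_size, num_hidden_layers (replaced here by a CacheConfig)
#   - num_hidden_layers, hidden_dim, max_seq_len, dtype, device
#   - cache_implementation ("static") and use_kv_buckets for the pre-allocated KV buckets
#   - generation_config: a GenerationConfig used for defaults
#   - _sample(input_ids, logits_processing_list, generation_config, past_key_values_buckets, ...)
#   - optionally generate(...) for model-specific pre-processing and capture_model(kv_caches)
#     for CUDA-graph capture.
# auto_sample() is the entry point the node code calls; it lazily builds one
# AutoRegressiveGeneration per loaded model and reuses it across calls.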


@@ -0,0 +1,59 @@
import torch
import torch.nn as nn
from typing import Optional, Dict
import gc
_NUM_WARMUP_ITERS = 2
class CUDAGraphRunner(nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
self.input_buffers: Dict[str, torch.Tensor] = {}
self.output_buffers: Dict[str, torch.Tensor] = {}
self._graph: Optional[torch.cuda.CUDAGraph] = None
@property
def graph(self):
assert self._graph is not None
return self._graph
    def capture(self, *args, **kwargs):
        assert self._graph is None
        # capture-only options must not be forwarded to the wrapped model
        memory_pool = kwargs.pop("memory_pool", None)
        stream = kwargs.pop("stream", None)
        for _ in range(_NUM_WARMUP_ITERS):
            self.model(*args, **kwargs)
        torch.cuda.synchronize()
        self._graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(self._graph, pool = memory_pool, stream = stream):
            last_hidden_states = self.model(*args, **kwargs)
gc.collect()
torch.cuda.synchronize()
        self.input_buffers = {
            # keep every positional arg so indices line up with `forward`; only tensors are copied there
            "args": list(args),
            "kwargs": {k: v for k, v in kwargs.items() if isinstance(v, torch.Tensor)},
        }
self.output_buffers = {
"hidden_states": last_hidden_states
}
def forward(self, *args, **kwargs):
for i, arg in enumerate(args):
if isinstance(arg, torch.Tensor):
self.input_buffers["args"][i].copy_(arg, non_blocking=True)
for k, v in kwargs.items():
if k in self.input_buffers["kwargs"] and isinstance(v, torch.Tensor):
self.input_buffers["kwargs"][k].copy_(v, non_blocking=True)
self.graph.replay()
return self.output_buffers["hidden_states"]
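# A minimal capture/replay sketch with a toy module (requires a CUDA device); note that
# forward() always returns the same output buffer, so clone it if results must survive
# the next replay:
if __name__ == "__main__" and torch.cuda.is_available():
    toy = nn.Linear(16, 16).cuda().eval()
    runner = CUDAGraphRunner(toy)
    static_x = torch.zeros(1, 16, device="cuda")
    runner.capture(static_x)                                # warm up, then record the graph once
    y1 = runner(torch.randn(1, 16, device="cuda")).clone()  # copy new data in, replay, snapshot output
    y2 = runner(torch.randn(1, 16, device="cuda")).clone()  # same graph, new data, no re-capture
    print(y1.shape, y2.shape)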


@@ -0,0 +1,330 @@
import copy
import math
import torch
import scipy.signal
import torchaudio
import numpy as np
import torch.nn.functional as F
from typing import Optional, List
# defaulted to the new pytorch api
def _new_rfft(x: torch.Tensor):
z = torch.fft.rfft(x, dim=-1)
return torch.view_as_real(z)
def _new_irfft(x: torch.Tensor, length: int):
x = torch.view_as_complex(x)
return torch.fft.irfft(x, length, dim=-1)
def _compl_mul_conjugate(a: torch.Tensor, b: torch.Tensor):
    # changed this function to use the pytorch api; multiplying by the conjugate makes the
    # time-domain result a cross-correlation (matching F.conv1d). Unlike the original einsum
    # version it relies on broadcasting, so it assumes a single input channel, which is how
    # the Meter below calls it.
    return torch.view_as_real(torch.view_as_complex(a) * torch.view_as_complex(b).conj())
def unfold(input, kernel_size: int, stride: int):
shape = list(input.shape)
length = shape.pop(-1)
n_frames = math.ceil((max(length, kernel_size) - kernel_size) / stride) + 1
tgt_length = (n_frames - 1) * stride + kernel_size
padded = F.pad(input, (0, tgt_length - length)).contiguous()
strides: List[int] = []
for dim in range(padded.dim()):
strides.append(padded.stride(dim))
last_stride = strides.pop(-1)
assert last_stride == 1, 'data should be contiguous'
strides = strides + [stride, 1]
return padded.as_strided(shape + [n_frames, kernel_size], strides)
# convert the signal and filter to frequency domain, multiply them, then inverse FFT to get back to time-domain
# faster than a sliding window over time-domain.
def fft_conv1d(
input: torch.Tensor, weight: torch.Tensor,
bias: Optional[torch.Tensor] = None, stride: int = 1, padding: int = 0,
block_ratio: float = 5):
input = F.pad(input, (padding, padding))
batch, _, length = input.shape
out_channels, _, kernel_size = weight.shape
_rfft = _new_rfft
_irfft = _new_irfft
if length < kernel_size:
raise RuntimeError(f"Input should be at least as large as the kernel size {kernel_size}, "
f"but it is only {length} samples long.")
if block_ratio < 1:
raise RuntimeError("Block ratio must be greater than 1.")
    # We are going to process the input block by block, because for some reason this is faster
    # and less memory intensive (I think the culprit is `torch.einsum`).
block_size: int = min(int(kernel_size * block_ratio), length)
fold_stride = block_size - kernel_size + 1
# replaces to_pad
weight = F.pad(weight, (0, block_size - weight.shape[-1]), mode = "constant", value = 0)
weight_z = _rfft(weight)
    # We pad the input and get the different frames on which the convolution will be applied.
frames = unfold(input, block_size, fold_stride)
frames_z = _rfft(frames)
out_z = _compl_mul_conjugate(frames_z, weight_z)
out = _irfft(out_z, block_size)
# The last bit is invalid, because FFT will do a circular convolution.
out = out[..., :-kernel_size + 1]
out = out.reshape(batch, out_channels, -1)
out = out[..., ::stride]
target_length = (length - kernel_size) // stride + 1
out = out[..., :target_length]
if bias is not None:
out += bias[:, None]
return out
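# Quick sanity check of the claim above (single input channel, as used by Meter below):
#   x = torch.randn(2, 1, 4096); w = torch.randn(1, 1, 128)
#   fft_conv1d(x, w) matches F.conv1d(x, w) up to small float error
#   (e.g. torch.allclose(fft_conv1d(x, w), F.conv1d(x, w), atol=1e-3)),
# i.e. for 1-channel inputs this is a drop-in replacement for F.conv1d that scales much
# better with kernel size, because the multiplication happens in the frequency domain.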
class IIRfilter(object):
def __init__(self, G, Q, fc, rate, filter_type, passband_gain=1.0):
self.G = G
self.Q = Q
self.fc = fc
self.rate = rate
self.filter_type = filter_type
self.passband_gain = passband_gain
def generate_coefficients(self):
A = 10**(self.G/40.0)
w0 = 2.0 * np.pi * (self.fc / self.rate)
alpha = np.sin(w0) / (2.0 * self.Q)
if self.filter_type == 'high_shelf':
b0 = A * ( (A+1) + (A-1) * np.cos(w0) + 2 * np.sqrt(A) * alpha )
b1 = -2 * A * ( (A-1) + (A+1) * np.cos(w0) )
b2 = A * ( (A+1) + (A-1) * np.cos(w0) - 2 * np.sqrt(A) * alpha )
a0 = (A+1) - (A-1) * np.cos(w0) + 2 * np.sqrt(A) * alpha
a1 = 2 * ( (A-1) - (A+1) * np.cos(w0) )
a2 = (A+1) - (A-1) * np.cos(w0) - 2 * np.sqrt(A) * alpha
elif self.filter_type == 'high_pass':
b0 = (1 + np.cos(w0))/2
b1 = -(1 + np.cos(w0))
b2 = (1 + np.cos(w0))/2
a0 = 1 + alpha
a1 = -2 * np.cos(w0)
a2 = 1 - alpha
        else:
            raise ValueError(f"Unsupported filter type: {self.filter_type}")
        return np.array([b0, b1, b2])/a0, np.array([a0, a1, a2])/a0
    def apply_filter(self, data):
        # coefficients are generated on demand; `self.b` / `self.a` are never stored on the object
        b, a = self.b_and_a
        return self.passband_gain * scipy.signal.lfilter(b, a, data)
@property
def b_and_a(self):
return self.generate_coefficients()
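# The two stages instantiated by Meter below reproduce the ITU-R BS.1770 K-weighting
# pre-filter, e.g. (same parameters as Meter.__init__, rate assumed to be 44100 Hz):
#   hs = IIRfilter(4.0, 1/np.sqrt(2), 1500.0, 44100, 'high_shelf')   # head-response shelf
#   hp = IIRfilter(0.0, 0.5, 38.0, 44100, 'high_pass')               # low-frequency high-pass
#   b, a = hs.b_and_a   # normalized biquad coefficients, ready for scipy/torchaudio lfilter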
class Meter(torch.nn.Module):
def __init__(
self,
rate: int,
filter_class: str = "K-weighting",
block_size: float = 0.400,
zeros: int = 512,
use_fir: bool = False,
):
super().__init__()
self.rate = rate
self.filter_class = filter_class
self.block_size = block_size
self.use_fir = use_fir
G = torch.from_numpy(np.array([1.0, 1.0, 1.0, 1.41, 1.41]))
self.register_buffer("G", G)
self._filters = {}
self._filters['high_shelf'] = IIRfilter(4.0, 1/np.sqrt(2), 1500.0, self.rate, 'high_shelf')
self._filters['high_pass'] = IIRfilter(0.0, 0.5, 38.0, self.rate, 'high_pass')
# Compute impulse responses so that filtering is fast via
# a convolution at runtime, on GPU, unlike lfilter.
impulse = np.zeros((zeros,))
impulse[..., 0] = 1.0
firs = np.zeros((len(self._filters), 1, zeros))
passband_gain = torch.tensor([filter.passband_gain for filter in self._filters.values()])
for i, (_, filter_stage) in enumerate(self._filters.items()):
b, a = filter_stage.b_and_a
firs[i] = scipy.signal.lfilter(b, a, impulse)
firs = torch.from_numpy(firs[..., ::-1].copy()).float()
self.register_buffer("firs", firs)
self.register_buffer("passband_gain", passband_gain)
def apply_filter_gpu(self, data: torch.Tensor):
        # Data is of shape (nb, nt, nch)
        # Reshape to (nb*nch, 1, nt)
nb, nt, nch = data.shape
data = data.permute(0, 2, 1)
data = data.reshape(nb * nch, 1, nt)
# Apply padding
pad_length = self.firs.shape[-1]
# Apply filtering in sequence
for i in range(self.firs.shape[0]):
data = F.pad(data, (pad_length, pad_length))
data = fft_conv1d(data, self.firs[i, None, ...])
data = self.passband_gain[i] * data
data = data[..., 1 : nt + 1]
data = data.permute(0, 2, 1)
data = data[:, :nt, :]
return data
def apply_filter_cpu(self, data: torch.Tensor):
for _, filter_stage in self._filters.items():
passband_gain = filter_stage.passband_gain
b, a = filter_stage.b_and_a
a_coeffs = torch.from_numpy(a).float().to(data.device)
b_coeffs = torch.from_numpy(b).float().to(data.device)
_data = data.permute(0, 2, 1)
filtered = torchaudio.functional.lfilter(
_data, a_coeffs, b_coeffs, clamp=False
)
data = passband_gain * filtered.permute(0, 2, 1)
return data
def apply_filter(self, data: torch.Tensor):
if data.is_cuda or self.use_fir:
data = self.apply_filter_gpu(data)
else:
data = self.apply_filter_cpu(data)
return data
def forward(self, data: torch.Tensor):
return self.integrated_loudness(data)
def _unfold(self, input_data):
T_g = self.block_size
overlap = 0.75 # overlap of 75% of the block duration
step = 1.0 - overlap # step size by percentage
kernel_size = int(T_g * self.rate)
stride = int(T_g * self.rate * step)
unfolded = unfold(input_data.permute(0, 2, 1), kernel_size, stride)
unfolded = unfolded.transpose(-1, -2)
return unfolded
def integrated_loudness(self, data: torch.Tensor):
if not torch.is_tensor(data):
data = torch.from_numpy(data).float()
else:
data = data.float()
input_data = copy.copy(data)
# Data always has a batch and channel dimension.
# Is of shape (nb, nt, nch)
if input_data.ndim < 2:
input_data = input_data.unsqueeze(-1)
if input_data.ndim < 3:
input_data = input_data.unsqueeze(0)
nb, _, nch = input_data.shape
        # Apply frequency weighting filters to account
        # for the acoustic response of the head and auditory system
input_data = self.apply_filter(input_data)
G = self.G # channel gains
T_g = self.block_size # 400 ms gating block standard
Gamma_a = -70.0 # -70 LKFS = absolute loudness threshold
unfolded = self._unfold(input_data)
z = (1.0 / (T_g * self.rate)) * unfolded.square().sum(2)
l = -0.691 + 10.0 * torch.log10((G[None, :nch, None] * z).sum(1, keepdim=True))
l = l.expand_as(z)
# find gating block indices above absolute threshold
z_avg_gated = z
z_avg_gated[l <= Gamma_a] = 0
masked = l > Gamma_a
z_avg_gated = z_avg_gated.sum(2) / masked.sum(2)
# calculate the relative threshold value (see eq. 6)
Gamma_r = (
-0.691 + 10.0 * torch.log10((z_avg_gated * G[None, :nch]).sum(-1)) - 10.0
)
Gamma_r = Gamma_r[:, None, None]
Gamma_r = Gamma_r.expand(nb, nch, l.shape[-1])
# find gating block indices above relative and absolute thresholds (end of eq. 7)
z_avg_gated = z
z_avg_gated[l <= Gamma_a] = 0
z_avg_gated[l <= Gamma_r] = 0
masked = (l > Gamma_a) * (l > Gamma_r)
z_avg_gated = z_avg_gated.sum(2) / masked.sum(2)
# # Cannot use nan_to_num (pytorch 1.8 does not come with GCP-supported cuda version)
# z_avg_gated = torch.nan_to_num(z_avg_gated)
z_avg_gated = torch.where(
z_avg_gated.isnan(), torch.zeros_like(z_avg_gated), z_avg_gated
)
z_avg_gated[z_avg_gated == float("inf")] = float(np.finfo(np.float32).max)
z_avg_gated[z_avg_gated == -float("inf")] = float(np.finfo(np.float32).min)
LUFS = -0.691 + 10.0 * torch.log10((G[None, :nch] * z_avg_gated).sum(1))
return LUFS.float()
def loudness(
    audio_data, sample_rate: int, target_loudness: float, filter_class: str = "K-weighting", block_size: float = 0.400, **kwargs
):
MIN_LOUDNESS = -70
device = audio_data.device
original_length = audio_data.shape[-1]
signal_duration = original_length / sample_rate
# Pad if too short
if signal_duration < 0.5:
pad_len = int((0.5 - signal_duration) * sample_rate)
audio_data = torch.nn.functional.pad(audio_data, (0, pad_len), mode="constant", value=0)
# create BS.1770 meter
meter = Meter(
sample_rate, filter_class=filter_class, block_size=block_size, **kwargs
)
meter = meter.to(audio_data.device)
# measure loudness
loudness = meter.integrated_loudness(audio_data.permute(0, 2, 1))
audio_data = audio_data[..., :original_length]
min_loudness = (
torch.ones_like(loudness, device=loudness.device) * MIN_LOUDNESS
)
_loudness = torch.maximum(loudness, min_loudness)
_loudness = _loudness.to(device)
delta_loudness = target_loudness - _loudness
gain = torch.pow(torch.tensor(10.0, device=device, dtype=audio_data.dtype), delta_loudness / 20.0)
output = gain * audio_data
if torch.max(torch.abs(output)) >= 1.0:
import warnings
warnings.warn("Possible clipped samples in output.")
return output
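# Minimal usage sketch (synthetic audio; the -23 LUFS target is chosen arbitrarily):
if __name__ == "__main__":
    audio = 0.1 * torch.randn(1, 2, 44100)  # (batch, channels, samples), 1 s of stereo noise
    normalized = loudness(audio, sample_rate=44100, target_loudness=-23.0)
    meter = Meter(44100)
    print(meter.integrated_loudness(normalized.permute(0, 2, 1)))  # approximately -23 LUFS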

comfy/ldm/higgsv2/model.py Normal file (1298 additions)

File diff suppressed because it is too large.


@@ -0,0 +1,730 @@
from typing import (
Dict, List, Optional, Union, Tuple, MutableMapping, Any, Mapping, Collection, get_type_hints, get_args, get_origin
)
from dataclasses import dataclass, fields, _FIELDS, _FIELD, _FIELD_INITVAR, is_dataclass
import torch.nn.functional as F
import numpy as np
import torch
import math
from functools import lru_cache
__MAX_SIZE = 2048
def _ceil_to_nearest(n, round_to):
return (n + round_to - 1) // round_to * round_to
@torch.jit.script
def build_delay_pattern_mask(
input_ids: torch.Tensor,
bos_token_id: int,
pad_token_id: int,
):
bsz, num_codebooks, seq_len = input_ids.shape
new_seq_len = seq_len + num_codebooks - 1
input_ids_with_gen_mask = torch.ones((bsz, num_codebooks, new_seq_len), dtype=torch.long, device=input_ids.device)
bos_mask = torch.tril(input_ids_with_gen_mask, -1) > 0
eos_mask = torch.triu(input_ids_with_gen_mask, seq_len) > 0
input_ids_with_gen_mask[bos_mask] = bos_token_id
input_ids_with_gen_mask[(~bos_mask) & (~eos_mask)] = input_ids.reshape(-1)
input_ids = input_ids_with_gen_mask.clone()
input_ids[eos_mask] = pad_token_id
input_ids_with_gen_mask[eos_mask] = -1
return input_ids, input_ids_with_gen_mask
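# Worked example of the delay pattern (bos=B, pad=P, 2 codebooks, 3 frames):
#   codes [[a, b, c],            input_ids  [[a, b, c, P],
#          [d, e, f]]  becomes               [B, d, e, f]]
# and input_ids_with_gen_mask is identical except that the padded upper-triangular
# positions hold -1, marking slots that still have to be generated.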
# implementation of dacite's from_dict with only necessary parts for ChatML
@lru_cache(maxsize=None)
def cache(function):
return lru_cache(maxsize=__MAX_SIZE, typed=True)(function)
@cache
def get_fields(data_class):
fields = getattr(data_class, _FIELDS)
return [f for f in fields.values() if f._field_type is _FIELD or f._field_type is _FIELD_INITVAR]
def is_optional(type_) -> bool:
return get_origin(type_) is Union and type(None) in get_args(type_)
def orig(data_class) -> Any:
if is_dataclass(data_class):
return data_class
return get_origin(data_class)
@cache
def extract_generic(type_, defaults: Tuple = ()) -> tuple:
try:
if getattr(type_, "_special", False):
return defaults
if type_.__args__ == ():
return (type_.__args__,)
return type_.__args__ or defaults # type: ignore
except AttributeError:
return defaults
def _build_value_for_collection(collection, data: Any) -> Any:
if isinstance(data, Mapping):
value_type = extract_generic(collection, defaults=(Any, Any))[1]
return {
key: _build_value(type_=value_type, data=value)
for key, value in data.items()
}
elif isinstance(data, Collection) and not isinstance(data, (str, bytes, Mapping)):
item_type = extract_generic(collection, defaults=(Any,))[0]
return [
_build_value(type_=item_type, data=item)
for item in data
]
return data
def _build_value(type_, data) -> Any:
if is_optional(type_) and data is None:
return data
if get_origin(type_) is Union:
data = _build_value_for_union(union=type_, data=data)
elif hasattr(type_, "__origin__"):
data = _build_value_for_collection(collection=type_, data=data)
elif cache(is_dataclass)(orig(type_)) and isinstance(data, Mapping):
data = from_dict(data_class=type_, data=data)
return data
def _build_value_for_union(union: type, data: Any) -> Any:
for inner_type in get_args(union):
if data is None and inner_type is type(None):
return None
try:
return _build_value(inner_type, data)
except Exception:
continue
raise ValueError(f"Cannot match {data!r} to any type in {union}")
def is_instance(value: Any, type_) -> bool:
if type_ is Any:
return True
origin = get_origin(type_)
args = get_args(type_)
if origin is Union:
return any(is_instance(value, arg) for arg in args)
if origin in (list, List):
if not isinstance(value, list):
return False
(elem_type,) = args or (Any,)
return all(is_instance(item, elem_type) for item in value)
if origin in (dict, Dict, Mapping):
if not isinstance(value, dict):
return False
key_type, val_type = args or (Any, Any)
return all(
is_instance(k, key_type) and is_instance(v, val_type)
for k, v in value.items()
)
try:
return isinstance(value, type_)
except TypeError:
return False
def from_dict(data_class, data):
init_values: MutableMapping[str, Any] = {}
post_init_values: MutableMapping[str, Any] = {}
data_class_hints = get_type_hints(data_class)
data_class_fields = cache(get_fields)(data_class)
extra_fields = set(data.keys()) - {f.name for f in data_class_fields}
if extra_fields:
formatted_keys = ", ".join(f'"{key}"' for key in extra_fields)
raise ValueError(f"cannot match {formatted_keys} to any data class field")
for field in data_class_fields:
field_type = data_class_hints[field.name]
key = field.name
if key in data:
try:
value = _build_value(type_=field_type, data=data[key])
except Exception as error:
raise ValueError(error)
if not is_instance(value, field_type):
raise ValueError((
f'wrong value type for field "{field.name}" - should be "{field_type}" '
f'instead of value "{value}" of type "{type(value)}"'
))
init_values[field.name] = value
instance = data_class(**init_values)
for key, value in post_init_values.items():
setattr(instance, key, value)
return instance
def normalize_chinese_punctuation(text):
"""
Convert Chinese (full-width) punctuation marks to English (half-width) equivalents.
"""
# Mapping of Chinese punctuation to English punctuation
chinese_to_english_punct = {
"": ", ", # comma
"": ".", # period
"": ":", # colon
"": ";", # semicolon
"": "?", # question mark
"": "!", # exclamation mark
"": "(", # left parenthesis
"": ")", # right parenthesis
"": "[", # left square bracket
"": "]", # right square bracket
"": "<", # left angle quote
"": ">", # right angle quote
"": '"', # left double quotation
"": '"', # right double quotation
"": "'", # left single quotation
"": "'", # right single quotation
"": ",", # enumeration comma
"": "-", # em dash
"": "...", # ellipsis
"·": ".", # middle dot
"": '"', # left corner bracket
"": '"', # right corner bracket
"": '"', # left double corner bracket
"": '"', # right double corner bracket
}
# Replace each Chinese punctuation with its English counterpart
for zh_punct, en_punct in chinese_to_english_punct.items():
text = text.replace(zh_punct, en_punct)
return text
def transcript_normalize(text: str):
transcript = normalize_chinese_punctuation(text)
transcript = transcript.replace("(", " ")
transcript = transcript.replace(")", " ")
transcript = transcript.replace("°F", " degrees Fahrenheit")
transcript = transcript.replace("°C", " degrees Celsius")
for tag, replacement in [
("[laugh]", "<SE>[Laughter]</SE>"),
("[humming start]", "<SE_s>[Humming]</SE_s>"),
("[humming end]", "<SE_e>[Humming]</SE_e>"),
("[music start]", "<SE_s>[Music]</SE_s>"),
("[music end]", "<SE_e>[Music]</SE_e>"),
("[music]", "<SE>[Music]</SE>"),
("[sing start]", "<SE_s>[Singing]</SE_s>"),
("[sing end]", "<SE_e>[Singing]</SE_e>"),
("[applause]", "<SE>[Applause]</SE>"),
("[cheering]", "<SE>[Cheering]</SE>"),
("[cough]", "<SE>[Cough]</SE>"),
]:
transcript = transcript.replace(tag, replacement)
lines = transcript.split("\n")
transcript = "\n".join([" ".join(line.split()) for line in lines if line.strip()])
transcript = transcript.strip()
if not any([transcript.endswith(c) for c in [".", "!", "?", ",", ";", '"', "'", "</SE_e>", "</SE>"]]):
transcript += "."
return transcript
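# Examples of the normalization above:
#   transcript_normalize("It's   20°C outside [laugh]") -> "It's 20 degrees Celsius outside <SE>[Laughter]</SE>"
#   transcript_normalize("hello world")                 -> "hello world."   (terminal punctuation added)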
@dataclass
class AudioContent:
audio_url: str
raw_audio: Optional[str] = None
offset: Optional[float] = None
duration: Optional[float] = None
row_id: Optional[int] = None
type: str = "audio"
@dataclass
class TextContent:
text: str
type: str = "text"
@dataclass
class Message:
role: str
content: Union[str, AudioContent, TextContent, List[Union[str, AudioContent, TextContent]]]
recipient: Optional[str] = None
@dataclass
class ChatMLSample:
messages: List[Message]
start_index: Optional[int] = None
misc: Optional[Dict] = None
speaker: Optional[str] = None
def prepare_chatml_sample(sample: Union[ChatMLSample, Dict], tokenizer):
try:
if not isinstance(sample, ChatMLSample):
# replacing pd.isna
def is_nan(x):
if isinstance(x, float):
return math.isnan(x)
if isinstance(x, np.generic):
return np.isnan(x)
if isinstance(x, torch.Tensor) and x.numel() == 1:
return torch.isnan(x).item()
return False
if "speaker" in sample and is_nan(sample["speaker"]):
sample["speaker"] = None
if "start_index" in sample and is_nan(sample["start_index"]):
sample["start_index"] = None
if "content" in sample and is_nan(sample["content"]):
sample["content"] = ""
def convert_nan_to_none(obj):
if isinstance(obj, np.ndarray):
return obj.tolist()
elif isinstance(obj, float) and math.isnan(obj):
return None
elif isinstance(obj, dict):
return {k: convert_nan_to_none(v) for k, v in obj.items()}
elif isinstance(obj, (list, tuple)):
return [convert_nan_to_none(item) for item in obj]
return obj
clean_sample = convert_nan_to_none(sample)
val_keys = []
for field in fields(ChatMLSample):
if field.name in clean_sample:
val_keys.append(field.name)
clean_sample = {k: clean_sample[k] for k in val_keys}
sample = from_dict(
data_class=ChatMLSample, data=clean_sample,
)
input_tokens = []
audio_contents = []
speaker_id = None
if sample.speaker is not None:
speaker_id = sample.speaker
elif sample.misc is not None:
if "speaker" in sample.misc:
speaker_id = sample.misc["speaker"]
total_m = len(sample.messages)
for turn_id, message in enumerate(sample.messages):
role = message.role
recipient = message.recipient
content = message.content
content_l = []
if isinstance(content, str):
content_l.append(TextContent(text=content))
elif isinstance(content, TextContent):
content_l.append(content)
elif isinstance(content, AudioContent):
content_l.append(content)
elif isinstance(content, list):
for ele in content:
if isinstance(ele, str):
content_l.append(TextContent(text=ele))
else:
content_l.append(ele)
if turn_id == 0:
prefix = f"<|begin_of_text|><|start_header_id|>{role}<|end_header_id|>\n\n"
else:
prefix = f"<|start_header_id|>{role}<|end_header_id|>\n\n"
eot_postfix = "<|eot_id|>"
eom_postfix = "<|eom_id|>"
prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
input_tokens.extend(prefix_tokens)
if recipient:
assert role == "assistant", "Recipient is only available for assistant role."
recipient_tokens = tokenizer.encode(f"{recipient}<|recipient|>", add_special_tokens=False)
input_tokens.extend(recipient_tokens)
for content in content_l:
if content.type == "text":
text_tokens = tokenizer.encode(content.text, add_special_tokens=False)
input_tokens.extend(text_tokens)
elif content.type == "audio":
audio_contents.append(content)
if role == "user" or role == "system":
text_tokens = tokenizer.encode(
f"<|audio_bos|><|AUDIO|><|audio_eos|>",
add_special_tokens=False,
)
input_tokens.extend(text_tokens)
elif role == "assistant":
text_tokens = tokenizer.encode(
f"<|audio_out_bos|><|AUDIO_OUT|><|audio_eos|>",
add_special_tokens=False,
)
input_tokens.extend(text_tokens)
next_id = turn_id + 1
if role == "assistant" and next_id != total_m and sample.messages[next_id].role == "assistant":
postfix_tokens = tokenizer.encode(eom_postfix, add_special_tokens=False)
input_tokens.extend(postfix_tokens)
else:
postfix_tokens = tokenizer.encode(eot_postfix, add_special_tokens=False)
input_tokens.extend(postfix_tokens)
return input_tokens, audio_contents, speaker_id
except Exception as e:
import json
print(f"Error in prepare_chatml_sample: {str(e)}")
print(f"Sample data: {json.dumps(sample, indent=2)}")
return None, None, None
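# Serialization example for the Llama-3 style ChatML template built above: a sample with a
# system text message, a user audio message and an assistant text reply is tokenized as
#   <|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system text}<|eot_id|>
#   <|start_header_id|>user<|end_header_id|>\n\n<|audio_bos|><|AUDIO|><|audio_eos|><|eot_id|>
#   <|start_header_id|>assistant<|end_header_id|>\n\n{assistant text}<|eot_id|>
# with <|eom_id|> instead of <|eot_id|> between consecutive assistant turns, and the
# AudioContent objects returned separately so the caller can attach waveforms/codes.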
@dataclass
class HiggsAudioBatchInput:
input_ids: torch.LongTensor # shape (bsz, seq_len).
attention_mask: torch.Tensor # shape (bsz, seq_len).
audio_out_ids: Optional[torch.LongTensor] # shape (num_codebooks, audio_out_total_length)
audio_out_ids_start: Optional[torch.LongTensor] # shape (num_audio_out,)
    audio_out_ids_start_group_loc: Optional[torch.LongTensor]  # shape (num_audio_out,), specifies which sample in the batch each audio-out segment belongs to
audio_in_ids: Optional[torch.LongTensor] # shape (num_codebooks, audio_in_total_length)
audio_in_ids_start: Optional[torch.LongTensor] # shape (num_audio_in,)
label_ids: Optional[torch.LongTensor] # shape (bsz, seq_len)
label_audio_ids: Optional[torch.LongTensor] # shape (num_codebooks, audio_out_total_length)
reward: Optional[float] = None
@dataclass
class ChatMLDatasetSample:
input_ids: torch.LongTensor # (seq_len,) Input text tokens
label_ids: torch.LongTensor # (seq_len,) Label IDs
audio_ids_concat: torch.LongTensor # (num_codebooks, audio_seq_len) Concatenated audio tokens
audio_ids_start: torch.LongTensor # (num_audios,) Start index of each audio token in `audio_ids_concat`
audio_waveforms_concat: torch.Tensor # (total_wv_length,) Concatenated audio waveforms
audio_waveforms_start: torch.LongTensor # (num_audios,) Start index of each waveform in `audio_waveforms_concat`
audio_sample_rate: torch.Tensor # (num_audios,) Sampling rate per audio waveform
audio_speaker_indices: torch.LongTensor # (num_audios,) Speaker indices per audio; -1 = unknown
audio_label_ids_concat: Optional[torch.LongTensor] = None # (num_codebooks, audio_seq_len) Optional audio token labels
reward: Optional[float] = None # Optional scalar reward
def num_audios(self):
return max(len(self.audio_waveforms_start), len(self.audio_ids_start))
def get_audio_codes(self, idx):
code_start = self.audio_ids_start[idx]
if idx < len(self.audio_ids_start) - 1:
code_end = self.audio_ids_start[idx + 1]
else:
code_end = self.audio_ids_concat.shape[-1]
return self.audio_ids_concat[:, code_start:code_end]
def get_audio_codes_labels(self, idx):
if self.audio_label_ids_concat is None:
return None
code_start = self.audio_ids_start[idx]
if idx < len(self.audio_ids_start) - 1:
code_end = self.audio_ids_start[idx + 1]
else:
code_end = self.audio_ids_concat.shape[-1]
return self.audio_label_ids_concat[:, code_start:code_end]
def get_wv(self, idx):
wv_start = self.audio_waveforms_start[idx]
sr = self.audio_sample_rate[idx]
if idx < len(self.audio_waveforms_start) - 1:
wv_end = self.audio_waveforms_start[idx + 1]
else:
wv_end = self.audio_waveforms_concat.shape[-1]
return self.audio_waveforms_concat[wv_start:wv_end], sr
class HiggsAudioSampleCollator:
def __init__(
self,
audio_in_token_id,
audio_out_token_id,
pad_token_id,
audio_stream_bos_id,
audio_stream_eos_id,
round_to=8,
pad_left=False,
return_audio_in_tokens=True,
audio_num_codebooks=None,
use_delay_pattern=False,
disable_audio_codes_transform=False,
add_new_bos_eos_for_long_chunk=True,
mask_audio_out_token_label=True,
):
self.round_to = round_to
self.pad_left = pad_left
self.audio_in_token_id = audio_in_token_id
self.audio_out_token_id = audio_out_token_id
self.audio_stream_bos_id = audio_stream_bos_id
self.audio_stream_eos_id = audio_stream_eos_id
self.pad_token_id = pad_token_id
self.return_audio_in_tokens = return_audio_in_tokens
self.audio_num_codebooks = audio_num_codebooks
self.use_delay_pattern = use_delay_pattern
self.disable_audio_codes_transform = disable_audio_codes_transform
self.add_new_bos_eos_for_long_chunk = add_new_bos_eos_for_long_chunk
self.mask_audio_out_token_label = mask_audio_out_token_label
def _process_and_duplicate_audio_tokens(
self, input_ids: torch.Tensor, audio_idx: int, wv: torch.Tensor, labels: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor, int]:
        # NOTE: this helper relies on `self.chunk_size_samples`, which is not set anywhere in this
        # trimmed-down collator; it is only safe to call when long-audio chunking is configured.
        total_samples = len(wv)
        num_chunks = math.ceil(total_samples / self.chunk_size_samples)
if num_chunks <= 1:
return input_ids, labels, 1
audio_token_seq = input_ids[audio_idx - 1 : audio_idx + 2]
duplicated_sequence = audio_token_seq.repeat(num_chunks)
new_input_ids = torch.cat([input_ids[: audio_idx - 1], duplicated_sequence, input_ids[audio_idx + 2 :]])
new_labels = None
if labels is not None:
label_seq = labels[audio_idx - 1 : audio_idx + 2]
duplicated_labels = label_seq.repeat(num_chunks)
new_labels = torch.cat([labels[: audio_idx - 1], duplicated_labels, labels[audio_idx + 2 :]])
return new_input_ids, new_labels, num_chunks
def __call__(self, batch: List[ChatMLDatasetSample]):
label_ids = None
label_audio_ids = None
if all([ele.label_ids is None for ele in batch]):
return_labels = False
else:
return_labels = True
processed_batch = batch
# Get the max sequence length based on processed batch
max_seq_length = _ceil_to_nearest(max([len(sample.input_ids) for sample in processed_batch]), self.round_to)
# Get the ids for audio-in and audio-out for each batch
audio_in_ids_l = []
audio_out_ids_l = []
audio_out_ids_group_loc_l = []
audio_in_label_ids_l = None
audio_out_label_ids_l = None
reward_l = []
if return_labels:
audio_out_no_train_flag = [] # Whether the audio-out data should be trained on or not.
# Process the audio inputs and outputs
for i in range(len(processed_batch)):
audio_in_mask = processed_batch[i].input_ids == self.audio_in_token_id
audio_out_mask = processed_batch[i].input_ids == self.audio_out_token_id
audio_ids = torch.ones_like(processed_batch[i].input_ids)
audio_ids[audio_in_mask ^ audio_out_mask] = torch.cumsum(audio_ids[audio_in_mask ^ audio_out_mask], 0) - 1
audio_in_ids = audio_ids[audio_in_mask]
audio_out_ids = audio_ids[audio_out_mask]
if return_labels:
audio_out_no_train_flag.append(processed_batch[i].label_ids[audio_out_mask] < 0)
if self.mask_audio_out_token_label:
processed_batch[i].label_ids[audio_out_mask] = -100
if self.return_audio_in_tokens:
audio_in_ids_l.extend(
[processed_batch[i].get_audio_codes(idx)[: self.audio_num_codebooks, :] for idx in audio_in_ids]
)
if processed_batch[i].audio_label_ids_concat is not None:
if audio_in_label_ids_l is None:
audio_in_label_ids_l = []
audio_in_label_ids_l.extend(
[
processed_batch[i].get_audio_codes_labels(idx)[: self.audio_num_codebooks, :]
for idx in audio_in_ids
]
)
audio_out_ids_l.extend(
[processed_batch[i].get_audio_codes(idx)[: self.audio_num_codebooks, :] for idx in audio_out_ids]
)
audio_out_ids_group_loc_l.append(i)
if processed_batch[i].reward is not None:
reward_l.append(processed_batch[i].reward)
if processed_batch[i].audio_label_ids_concat is not None:
if audio_out_label_ids_l is None:
audio_out_label_ids_l = []
audio_out_label_ids_l.extend(
[
processed_batch[i].get_audio_codes_labels(idx)[: self.audio_num_codebooks, :]
for idx in audio_out_ids
]
)
if return_labels:
audio_out_no_train_flag = torch.cat(audio_out_no_train_flag, dim=0)
if len(audio_in_ids_l) > 0:
            # I tried to remove the for-loop of the original implementation, but batching with
            # padding caused problems, so it is kept as list comprehensions instead.
lengths = [seg.shape[1] for seg in audio_in_ids_l]
aug_lengths = [l + 2 for l in lengths]
audio_in_ids_start = torch.cumsum(
torch.tensor([0] + aug_lengths[:-1], dtype=torch.long), dim=0
)
if self.disable_audio_codes_transform:
audio_in_ids = torch.cat(audio_in_ids_l, dim=1).long()
else:
with_tokens = [
torch.cat([
torch.full((seg.shape[0], 1), self.audio_stream_bos_id, dtype=torch.long),
seg,
torch.full((seg.shape[0], 1), self.audio_stream_eos_id, dtype=torch.long),
], dim=1)
for seg in audio_in_ids_l
]
if self.use_delay_pattern:
with_tokens = [
build_delay_pattern_mask(
tok.unsqueeze(0),
bos_token_id=self.audio_stream_bos_id,
pad_token_id=self.audio_stream_eos_id
)[0]
for tok in with_tokens
]
audio_in_ids = torch.cat(with_tokens, dim=1).long()
else:
audio_in_ids = torch.zeros((0, 0), dtype=torch.long)
audio_in_ids_start = torch.zeros(0, dtype=torch.long)
audio_out_ids_start_group_loc = None
if len(audio_out_ids_l) > 0:
new_audio_out_ids_l = []
label_audio_ids_l = []
for idx, ele in enumerate(audio_out_ids_l):
if self.disable_audio_codes_transform:
audio_codes = ele
if return_labels:
label_audio_ids = audio_out_label_ids_l[idx]
else:
audio_codes = torch.cat(
[
torch.full((ele.shape[0], 1), self.audio_stream_bos_id, dtype=torch.long),
ele,
torch.full((ele.shape[0], 1), self.audio_stream_eos_id, dtype=torch.long),
],
dim=1,
)
if return_labels:
label_audio_ids = torch.cat(
[
torch.full((ele.shape[0], 1), -100, dtype=torch.long),
ele,
torch.full((ele.shape[0], 1), self.audio_stream_eos_id, dtype=torch.long),
],
dim=1,
)
if self.use_delay_pattern:
audio_codes = build_delay_pattern_mask(
audio_codes.unsqueeze(0),
bos_token_id=self.audio_stream_bos_id,
pad_token_id=self.audio_stream_eos_id,
)[0].squeeze(0)
if return_labels:
label_audio_ids = build_delay_pattern_mask(
label_audio_ids.unsqueeze(0),
bos_token_id=-100,
pad_token_id=-100,
)[0].squeeze(0)
new_audio_out_ids_l.append(audio_codes)
if return_labels:
if audio_out_no_train_flag[idx]:
label_audio_ids[:] = -100
label_audio_ids_l.append(label_audio_ids)
audio_out_ids = torch.cat(new_audio_out_ids_l, dim=1).long()
if return_labels:
label_audio_ids = torch.cat(label_audio_ids_l, dim=1).long()
audio_out_ids_start = torch.cumsum(
torch.tensor([0] + [audio_codes.shape[1] for audio_codes in new_audio_out_ids_l[:-1]]), dim=0
)
audio_out_ids_start_group_loc = torch.tensor(audio_out_ids_group_loc_l, dtype=torch.long)
else:
audio_out_ids = torch.zeros((0, 0), dtype=torch.long)
audio_out_ids_start = torch.zeros(0, dtype=torch.long)
if return_labels:
label_audio_ids = torch.zeros((0, 0), dtype=torch.long)
reward = torch.tensor(reward_l, dtype=torch.float32)
# cleaner and faster implementation
def pad_sequence(seq, length, pad_value):
if self.pad_left:
padding = (length - len(seq), 0)
else:
padding = (0, length - len(seq))
return F.pad(seq, padding, value=pad_value)
input_ids = torch.stack([
pad_sequence(ele.input_ids, max_seq_length, self.pad_token_id)
for ele in processed_batch
])
if return_labels:
label_ids = torch.stack([
pad_sequence(ele.label_ids, max_seq_length, -100)
for ele in processed_batch
])
attention_mask = torch.stack([
pad_sequence(torch.ones_like(ele.input_ids), max_seq_length, 0)
for ele in processed_batch
])
if not self.return_audio_in_tokens:
audio_in_ids = None
audio_in_ids_start = None
if self.audio_num_codebooks is not None:
if audio_in_ids is not None:
audio_in_ids = audio_in_ids[: self.audio_num_codebooks]
if audio_out_ids is not None:
audio_out_ids = audio_out_ids[: self.audio_num_codebooks]
if label_audio_ids is not None:
label_audio_ids = label_audio_ids[: self.audio_num_codebooks]
return HiggsAudioBatchInput(
input_ids=input_ids,
attention_mask=attention_mask,
audio_out_ids=audio_out_ids,
audio_out_ids_start=audio_out_ids_start,
audio_out_ids_start_group_loc=audio_out_ids_start_group_loc,
audio_in_ids=audio_in_ids,
audio_in_ids_start=audio_in_ids_start,
label_ids=label_ids,
label_audio_ids=label_audio_ids,
reward=reward,
)
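# A minimal collation sketch with toy token ids (the real ids come from the Higgs tokenizer /
# HiggsAudioConfig; the 4 codebooks and 0..1023 code range here are arbitrary):
if __name__ == "__main__":
    collator = HiggsAudioSampleCollator(
        audio_in_token_id=128015, audio_out_token_id=128016, pad_token_id=128001,
        audio_stream_bos_id=1024, audio_stream_eos_id=1025,
    )
    sample = ChatMLDatasetSample(
        input_ids=torch.tensor([1, 2, 128016, 3]),        # one <|AUDIO_OUT|> placeholder
        label_ids=None,
        audio_ids_concat=torch.randint(0, 1024, (4, 6)),  # (num_codebooks, audio_seq_len)
        audio_ids_start=torch.tensor([0]),
        audio_waveforms_concat=torch.zeros(0),
        audio_waveforms_start=torch.zeros(0, dtype=torch.long),
        audio_sample_rate=torch.zeros(0),
        audio_speaker_indices=torch.zeros(0, dtype=torch.long),
    )
    batch = collator([sample])
    # text is right-padded to a multiple of round_to=8; audio codes gain bos/eos columns -> (4, 8)
    print(batch.input_ids.shape, batch.audio_out_ids.shape, batch.audio_out_ids_start)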

comfy/ldm/higgsv2/test.py Normal file (560 additions)

@@ -0,0 +1,560 @@
import asyncio
import base64
import math
import torch
import numpy as np
from io import BytesIO
from dataclasses import dataclass
from typing import List, Optional, Union, Any
from copy import deepcopy
from transformers import AutoTokenizer
from transformers.cache_utils import StaticCache
from dataclasses import asdict
from loguru import logger
import librosa
import os
from tokenizer import HiggsAudioTokenizer
import json
from huggingface_hub import snapshot_download
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto import CONFIG_MAPPING
from model import HiggsAudioConfig
class HiggsAudioEncoderConfig(PretrainedConfig):
"""Configuration of the Audio encoder in Higgs-Audio."""
model_type = "higgs_audio_encoder"
def __init__(
self,
num_mel_bins=128,
encoder_layers=32,
encoder_attention_heads=20,
encoder_ffn_dim=5120,
encoder_layerdrop=0.0,
d_model=1280,
dropout=0.0,
attention_dropout=0.0,
activation_function="gelu",
activation_dropout=0.0,
scale_embedding=False,
init_std=0.02,
max_source_positions=1500,
pad_token_id=128001,
**kwargs,
):
super().__init__(**kwargs)
self.num_mel_bins = num_mel_bins
self.d_model = d_model
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.encoder_ffn_dim = encoder_ffn_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_function = activation_function
self.activation_dropout = activation_dropout
self.encoder_layerdrop = encoder_layerdrop
self.num_hidden_layers = encoder_layers
self.init_std = init_std
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
self.max_source_positions = max_source_positions
self.pad_token_id = pad_token_id
class HiggsAudioConfig(PretrainedConfig):
model_type = "higgs_audio"
is_composition = True
def __init__(
self,
text_config=None,
audio_encoder_config=None,
audio_tokenizer_config=None,
audio_adapter_type="stack",
audio_embed_avg=False,
audio_ffn_hidden_size=4096,
audio_ffn_intermediate_size=14336,
audio_dual_ffn_layers=None,
audio_decoder_proj_num_layers=0,
encode_whisper_embed=True,
encode_audio_in_tokens=False,
use_delay_pattern=False,
skip_audio_tower=False,
use_audio_out_embed_projector=False,
use_audio_out_self_attention=False,
use_rq_transformer=False,
rq_transformer_hidden_size=None,
rq_transformer_intermediate_size=None,
rq_transformer_num_attention_heads=None,
rq_transformer_num_key_value_heads=None,
rq_transformer_num_hidden_layers=3,
audio_num_codebooks=12,
audio_codebook_size=1024,
audio_stream_bos_id=1024,
audio_stream_eos_id=1025,
audio_bos_token="<|audio_bos|>",
audio_eos_token="<|audio_eos|>",
audio_out_bos_token="<|audio_out_bos|>",
audio_in_token="<|AUDIO|>",
audio_out_token="<|AUDIO_OUT|>",
audio_in_token_idx=128015,
audio_out_token_idx=128016,
pad_token_id=128001,
audio_out_bos_token_id=128013,
audio_eos_token_id=128012,
**kwargs,
):
if isinstance(text_config, dict):
text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
elif text_config is None:
text_config = CONFIG_MAPPING["llama"]()
assert audio_adapter_type in [
"stack",
"dual_ffn",
"dual_ffn_fast_forward",
], f"Invalid audio adapter type: {audio_adapter_type}"
if audio_adapter_type.startswith("dual_ffn"):
assert audio_dual_ffn_layers is not None, (
"audio_dual_ffn_layers must be specified when using dual_ffn adapter."
)
self.text_config = text_config
self.audio_encoder_config = audio_encoder_config
self.audio_tokenizer_config = audio_tokenizer_config
self.audio_adapter_type = audio_adapter_type
self.audio_embed_avg = audio_embed_avg
self.audio_ffn_hidden_size = audio_ffn_hidden_size
self.audio_ffn_intermediate_size = audio_ffn_intermediate_size
self.audio_dual_ffn_layers = audio_dual_ffn_layers
self.audio_decoder_proj_num_layers = audio_decoder_proj_num_layers
self.encode_whisper_embed = encode_whisper_embed
self.encode_audio_in_tokens = encode_audio_in_tokens
self.use_delay_pattern = use_delay_pattern
self.skip_audio_tower = skip_audio_tower
self.use_audio_out_embed_projector = use_audio_out_embed_projector
self.use_audio_out_self_attention = use_audio_out_self_attention
self.use_rq_transformer = use_rq_transformer
if self.use_rq_transformer:
assert not self.use_delay_pattern, "Delay pattern is not supported if you turned on RQ-Transformer!"
self.rq_transformer_hidden_size = rq_transformer_hidden_size
self.rq_transformer_intermediate_size = rq_transformer_intermediate_size
self.rq_transformer_num_attention_heads = rq_transformer_num_attention_heads
self.rq_transformer_num_key_value_heads = rq_transformer_num_key_value_heads
self.rq_transformer_num_hidden_layers = rq_transformer_num_hidden_layers
if use_rq_transformer:
# For RQ-Transformer, we set the hidden_size to the same as the text model's hidden size if it is not specified.
if self.rq_transformer_hidden_size is None:
self.rq_transformer_hidden_size = text_config.hidden_size
assert self.rq_transformer_hidden_size % 128 == 0
if self.rq_transformer_intermediate_size is None:
self.rq_transformer_intermediate_size = text_config.intermediate_size
if self.rq_transformer_num_attention_heads is None:
self.rq_transformer_num_attention_heads = self.rq_transformer_hidden_size // 128
if self.rq_transformer_num_key_value_heads is None:
self.rq_transformer_num_key_value_heads = self.rq_transformer_hidden_size // 128 // 4
assert self.rq_transformer_hidden_size % self.rq_transformer_num_attention_heads == 0
assert self.rq_transformer_hidden_size % self.rq_transformer_num_key_value_heads == 0
self.audio_num_codebooks = audio_num_codebooks
self.audio_codebook_size = audio_codebook_size
self.audio_bos_token = audio_bos_token
self.audio_eos_token = audio_eos_token
self.audio_out_bos_token = audio_out_bos_token
self.audio_in_token = audio_in_token
self.audio_out_token = audio_out_token
self.audio_in_token_idx = audio_in_token_idx
self.audio_out_token_idx = audio_out_token_idx
self.audio_stream_bos_id = audio_stream_bos_id
self.audio_stream_eos_id = audio_stream_eos_id
self.audio_out_bos_token_id = audio_out_bos_token_id
self.audio_eos_token_id = audio_eos_token_id
super().__init__(**kwargs)
self.pad_token_id = pad_token_id
from model import HiggsAudioModel
from preprocess import HiggsAudioSampleCollator, ChatMLSample, ChatMLDatasetSample, prepare_chatml_sample, Message
def load_higgs_audio_tokenizer(tokenizer_name_or_path, device="cuda"):
is_local = os.path.exists(tokenizer_name_or_path)
if not is_local:
tokenizer_path = snapshot_download(tokenizer_name_or_path)
else:
tokenizer_path = tokenizer_name_or_path
config_path = os.path.join(tokenizer_path, "config.json")
model_path = os.path.join(tokenizer_path, "model.pth")
config = json.load(open(config_path))
model = HiggsAudioTokenizer(
**config,
device=device,
)
parameter_dict = torch.load(model_path, map_location=device)
model.load_state_dict(parameter_dict, strict=False)
model.to(device)
model.eval()
return model
def revert_delay_pattern(data):
"""Convert samples encoded with delay pattern back to the original form.
Args:
data (:obj:`torch.Tensor`):
The data with delay pattern applied. It will have shape (num_codebooks, seq_len + num_codebooks - 1).
Returns:
ret (:obj:`torch.Tensor`):
Recovered data with delay pattern removed. It will have shape (num_codebooks, seq_len).
"""
assert len(data.shape) == 2
out_l = []
num_codebooks = data.shape[0]
for i in range(num_codebooks):
out_l.append(data[i : (i + 1), i : (data.shape[1] - num_codebooks + 1 + i)])
return torch.cat(out_l, dim=0)
@dataclass
class HiggsAudioStreamerDelta:
"""Represents a chunk of generated content, either text or audio tokens."""
text: Optional[str] = None
text_tokens: Optional[torch.Tensor] = None
audio_tokens: Optional[torch.Tensor] = None
finish_reason: Optional[str] = None
@dataclass
class HiggsAudioResponse:
audio: Optional[np.ndarray] = None
generated_audio_tokens: Optional[np.ndarray] = None
sampling_rate: Optional[int] = None
generated_text: str = ""
generated_text_tokens: Optional[np.ndarray] = None
usage: Optional[dict] = None
class HiggsAudioServeEngine:
def __init__(
self,
model_name_or_path: str,
audio_tokenizer_name_or_path: str,
tokenizer_name_or_path: Optional[str] = None,
device: str = "cuda",
torch_dtype: Union[torch.dtype, str] = torch.float16,
kv_cache_lengths: List[int] = [1024, 4096, 8192], # Multiple KV cache sizes
):
self.device = device
self.model_name_or_path = model_name_or_path
self.torch_dtype = torch_dtype
torch.set_default_device("cuda")
# Initialize model and tokenizer
config = HiggsAudioConfig.from_pretrained(
"bosonai/higgs-audio-v2-generation-3B-base",
#trust_remote_code=True
)
#config.num_hidden_layers = config.num_hidden_layers // 2
# ---- Audio Config ----
#config.audio_dual_ffn_layers = config.audio_dual_ffn_layers[:12]
#config.audio_ffn_hidden_size //= 2 # 3072 → 1536
#config.audio_ffn_intermediate_size //= 2 # 8192 → 4096
# ---- Text Config ----
#config.text_config.hidden_size //= 2 # 3072 → 1536
#config.text_config.intermediate_size //= 2 # 8192 → 4096
#config.text_config.num_attention_heads //= 2 # 24 → 12
#config.text_config.num_key_value_heads //= 2 # 8 → 4
#config.text_config.num_hidden_layers //= 2 # 28 → 14
#config.text_config.head_dim //= 2 # 128 → 64
# ---- Shared ----
#config.hidden_size //= 2 # 3072 → 1536
self.model = HiggsAudioModel.from_pretrained(model_name_or_path, torch_dtype = torch_dtype, config = HiggsAudioConfig.from_pretrained(model_name_or_path))#(config = config, device = device, operations = torch.nn, dtype = torch_dtype)
print(self.model.device)
self.model.config = config
logger.info(f"Loaded model from {model_name_or_path}, dtype: {self.model.dtype}")
if tokenizer_name_or_path is None:
tokenizer_name_or_path = model_name_or_path
logger.info(f"Loading tokenizer from {tokenizer_name_or_path}")
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
logger.info(f"Initializing Higgs Audio Tokenizer")
self.audio_tokenizer = load_higgs_audio_tokenizer(audio_tokenizer_name_or_path, device=device)
self.audio_num_codebooks = self.model.config.audio_num_codebooks
self.audio_codebook_size = self.model.config.audio_codebook_size
self.audio_tokenizer_tps = self.audio_tokenizer.tps
self.samples_per_token = int(self.audio_tokenizer.sampling_rate // self.audio_tokenizer_tps)
self.hamming_window_len = 2 * self.audio_num_codebooks * self.samples_per_token
# Set the audio special tokens
# Prepare KV caches for different lengths
cache_config = deepcopy(self.model.config.text_config)
cache_config.num_hidden_layers = self.model.config.text_config.num_hidden_layers
if self.model.config.audio_dual_ffn_layers:
cache_config.num_hidden_layers += len(self.model.config.audio_dual_ffn_layers)
# A list of KV caches for different lengths
self.kv_caches = {
length: StaticCache(
config=cache_config,
max_batch_size=1,
max_cache_len=length,
device=self.model.device,
dtype=self.model.dtype,
)
for length in sorted(kv_cache_lengths)
}
# Reuse collator to prepare inference samples
self.collator = HiggsAudioSampleCollator(
audio_in_token_id=self.model.config.audio_in_token_idx,
audio_out_token_id=self.model.config.audio_out_token_idx,
audio_stream_bos_id=self.model.config.audio_stream_bos_id,
audio_stream_eos_id=self.model.config.audio_stream_eos_id,
pad_token_id=self.model.config.pad_token_id,
return_audio_in_tokens=False,
use_delay_pattern=self.model.config.use_delay_pattern,
audio_num_codebooks=self.model.config.audio_num_codebooks,
round_to=1,
)
# Capture CUDA graphs for each KV cache length
#if device == "cuda":
# logger.info(f"Capturing CUDA graphs for each KV cache length")
# self.model.capture_model(self.kv_caches.values())
def _prepare_inputs(self, chat_ml_sample: ChatMLSample, force_audio_gen: bool = False):
input_tokens, audio_contents, _ = prepare_chatml_sample(
chat_ml_sample,
self.tokenizer,
)
postfix = "<|start_header_id|>assistant<|end_header_id|>\n\n"
if force_audio_gen:
postfix += "<|audio_out_bos|>"
postfix = self.tokenizer.encode(postfix, add_special_tokens=False)
input_tokens.extend(postfix)
# Configure the audio inputs
audio_ids_l = []
for audio_content in audio_contents:
if audio_content.audio_url not in ["placeholder", ""]:
raw_audio, _ = librosa.load(audio_content.audio_url, sr=self.audio_tokenizer.sampling_rate)
elif audio_content.raw_audio is not None:
raw_audio, _ = librosa.load(
BytesIO(base64.b64decode(audio_content.raw_audio)), sr=self.audio_tokenizer.sampling_rate
)
else:
raw_audio = None
if raw_audio is not None:
audio_ids = self.audio_tokenizer.encode(raw_audio, self.audio_tokenizer.sampling_rate)
audio_ids_l.append(audio_ids.squeeze(0).cpu())
if len(audio_ids_l) > 0:
audio_ids_start = torch.tensor(
np.cumsum(np.array([0] + [audio_ids.shape[1] for audio_ids in audio_ids_l])),
dtype=torch.long,
device=self.device,
)[0:-1]
audio_ids_concat = torch.cat(audio_ids_l, dim=1)
else:
audio_ids_start = None
audio_ids_concat = None
sample = ChatMLDatasetSample(
input_ids=torch.LongTensor(input_tokens),
label_ids=None,
audio_ids_concat=audio_ids_concat,
audio_ids_start=audio_ids_start,
audio_waveforms_concat=None,
audio_waveforms_start=None,
audio_sample_rate=None,
audio_speaker_indices=None,
)
data = self.collator([sample])
inputs = asdict(data)
for k, v in inputs.items():
if isinstance(v, torch.Tensor):
inputs[k] = v.to(self.model.device)
return inputs
def _prepare_kv_caches(self):
for kv_cache in self.kv_caches.values():
kv_cache.reset()
def generate(
self,
chat_ml_sample: ChatMLSample,
max_new_tokens: int,
temperature: float = 0.7,
top_k: Optional[int] = None,
top_p: float = 0.95,
stop_strings: Optional[List[str]] = None,
force_audio_gen: bool = False,
ras_win_len: Optional[int] = 7,
ras_win_max_num_repeat: int = 2,
seed: Optional[int] = None,
):
# Default stop strings
if stop_strings is None:
stop_strings = ["<|end_of_text|>", "<|eot_id|>"]
if ras_win_len is not None and ras_win_len <= 0:
ras_win_len = None
with torch.no_grad():
inputs = self._prepare_inputs(chat_ml_sample, force_audio_gen=force_audio_gen)
prompt_token_ids = inputs["input_ids"][0].cpu().numpy()
self._prepare_kv_caches()
from autoregressive_sampling import auto_sample
outputs = auto_sample(
self.model,
**inputs,
max_new_tokens=max_new_tokens,
use_cache=True,
stop_strings=stop_strings,
tokenizer=self.tokenizer,
do_sample=False if temperature == 0.0 else True,
temperature=temperature,
top_k=top_k,
top_p=top_p,
past_key_values_buckets=self.kv_caches,
ras_win_len=ras_win_len,
ras_win_max_num_repeat=ras_win_max_num_repeat,
seed=seed,
)
if len(outputs) > 0:
wv_list = []
for output_audio in outputs:
vq_code = revert_delay_pattern(output_audio).clip(0, self.audio_codebook_size - 1)[:, 1:-1]
wv_numpy = self.audio_tokenizer.decode(vq_code.unsqueeze(0))[0, 0]
wv_list.append(wv_numpy)
wv_numpy = torch.cat(wv_list)
wv_numpy = wv_numpy.cpu().numpy()
else:
wv_numpy = None
# We only support one request at a time now
#generated_text_tokens = outputs[0][0].cpu().numpy()[len(prompt_token_ids) :]
#generated_text = self.tokenizer.decode(generated_text_tokens)
#print(generated_text)
generated_audio_tokens = outputs[0].cpu().numpy()
return HiggsAudioResponse(
audio=wv_numpy,
generated_audio_tokens=generated_audio_tokens,
sampling_rate=self.audio_tokenizer.sampling_rate,
#generated_text=generated_text,
#generated_text_tokens=generated_text_tokens,
usage={
"prompt_tokens": prompt_token_ids.shape[0],
# "completion_tokens": generated_text_tokens.shape[0] + generated_audio_tokens.shape[1],
"total_tokens": (
prompt_token_ids.shape[0] #+ generated_text_tokens.shape[0] + generated_audio_tokens.shape[1]
),
"cached_tokens": 0,
},
)
def get_zero_shot_input_sample():
system_prompt = (
"Generate audio following instruction.\n\n<|scene_desc_start|>\nSPEAKER0: british accent\n<|scene_desc_end|>"
)
messages = [
Message(
role="system",
content=system_prompt,
),
Message(
role="user",
content="Hey, everyone! Welcome back to Tech Talk Tuesdays.\n"
"It's your host, Alex, and today, we're diving into a topic that's become absolutely crucial in the tech world — deep learning.\n"
"And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.",
),
]
chat_ml_sample = ChatMLSample(messages=messages)
return chat_ml_sample
def _update_model_kwargs_for_generation(
self,
outputs,
model_kwargs: dict[str, Any],
num_new_tokens: int = 1,
) -> dict[str, Any]:
# past_key_values will be the standard name for kv cache naming
model_kwargs["past_key_values"] = outputs.past_key_values
# TODO: consider removing token_type_ids here
# update token_type_ids with last value
if "token_type_ids" in model_kwargs:
token_type_ids = model_kwargs["token_type_ids"]
model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
# update attention mask
if "attention_mask" in model_kwargs:
attention_mask = model_kwargs["attention_mask"]
model_kwargs["attention_mask"] = torch.cat(
[attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
)
if model_kwargs.get("use_cache", True):
model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
else:
past_positions = model_kwargs.pop("cache_position")
new_positions = torch.arange(
past_positions[-1] + 1, past_positions[-1] + num_new_tokens + 1, dtype=past_positions.dtype
).to(past_positions.device)
model_kwargs["cache_position"] = torch.cat((past_positions, new_positions))
return model_kwargs
MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer"
def main():
input_sample = get_zero_shot_input_sample()
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device}")
torch.manual_seed(2025)
torch.cuda.manual_seed_all(2025)
torch.cuda.manual_seed(2025)
serve_engine = HiggsAudioServeEngine(
MODEL_PATH,
AUDIO_TOKENIZER_PATH,
device=device,
)
import time
logger.info("Starting generation...")
start_time = time.time()
output: HiggsAudioResponse = serve_engine.generate(
chat_ml_sample=input_sample,
max_new_tokens=1024,
top_k=50,
stop_strings=["<|end_of_text|>", "<|eot_id|>"],
)
elapsed_time = time.time() - start_time
logger.info(f"Generation time: {elapsed_time:.2f} seconds")
print(output)
import torchaudio
torchaudio.save(f"output_.wav", torch.from_numpy(output.audio)[None, :], output.sampling_rate)
logger.info(f"Generated text:\n{output.generated_text}")
logger.info(f"Saved audio to output_.wav")
if __name__ == "__main__":
main()

File diff suppressed because it is too large

View File

@ -0,0 +1,872 @@
import os
import math
import torch
import torch.nn as nn
from typing import Optional
import torch.nn.functional as F
from torch.nn.utils.parametrizations import weight_norm
import torchaudio
import numpy as np
from torch import vmap
from transformers import AutoModel
def WNConv1d(*args, device = None, dtype = None, operations = None, **kwargs):
return weight_norm(operations.Conv1d(*args, **kwargs, device = device, dtype = dtype))
def WNConvTranspose1d(*args, device = None, dtype = None, operations = None, **kwargs):
return weight_norm(operations.ConvTranspose1d(*args, **kwargs, device = device, dtype = dtype))
@torch.jit.script
def snake(x, alpha):
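# Snake activation: snake(x) = x + (1 / alpha) * sin^2(alpha * x), a periodic
# non-linearity commonly used in neural audio codecs. The 1e-9 added to alpha
# guards against division by zero, and the reshape lets the per-channel alpha
# broadcast over (B, C, T) inputs.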
shape = x.shape
x = x.reshape(shape[0], shape[1], -1)
x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
x = x.reshape(shape)
return x
class Snake1d(nn.Module):
def __init__(self, channels, device = None, dtype = None):
super().__init__()
self.alpha = nn.Parameter(torch.ones(1, channels, 1, device = device, dtype = dtype))
def forward(self, x):
return snake(x, self.alpha)
class DACResidualUnit(nn.Module):
def __init__(self, dim: int = 16, dilation: int = 1, device = None, dtype = None, operations = None):
super().__init__()
pad = ((7 - 1) * dilation) // 2
self.block = nn.Sequential(
Snake1d(dim, device = device, dtype = dtype),
WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad, device = device, dtype = dtype, operations = operations),
Snake1d(dim, device = device, dtype = dtype),
WNConv1d(dim, dim, kernel_size=1, device = device, dtype = dtype, operations = operations),
)
def forward(self, x):
y = self.block(x)
pad = (x.shape[-1] - y.shape[-1]) // 2
if pad > 0:
x = x[..., pad:-pad]
return x + y
class DACEncoderBlock(nn.Module):
def __init__(self, dim: int = 16, stride: int = 1, device = None, dtype = None, operations = None):
super().__init__()
self.block = nn.Sequential(
DACResidualUnit(dim // 2, dilation=1, device = device, dtype = dtype, operations = operations),
DACResidualUnit(dim // 2, dilation=3, device = device, dtype = dtype, operations = operations),
DACResidualUnit(dim // 2, dilation=9, device = device, dtype = dtype, operations = operations),
Snake1d(dim // 2),
WNConv1d(
dim // 2,
dim,
kernel_size=2 * stride,
stride=stride,
padding=math.ceil(stride / 2),
device = device, dtype = dtype, operations = operations
),
)
def forward(self, x):
return self.block(x)
class DACEncoder(nn.Module):
def __init__(
self,
d_model: int = 64,
strides: list = [2, 4, 8, 8],
d_latent: int = 256,
device = None, dtype = None, operations = None
):
super().__init__()
# Create first convolution
self.block = [WNConv1d(1, d_model, kernel_size=7, padding=3, device = device, dtype = dtype, operations = operations)]
# Create EncoderBlocks that double channels as they downsample by `stride`
for stride in strides:
d_model *= 2
self.block += [DACEncoderBlock(d_model, stride=stride, device = device, dtype = dtype, operations = operations)]
# Create last convolution
self.block += [
Snake1d(d_model),
WNConv1d(d_model, d_latent, kernel_size=3, padding=1, device = device, dtype = dtype, operations = operations),
]
# Wrap block into nn.Sequential
self.block = nn.Sequential(*self.block)
self.enc_dim = d_model
def forward(self, x):
return self.block(x)
class DACDecoderBlock(nn.Module):
def __init__(self, input_dim: int = 16, output_dim: int = 8, stride: int = 1, device = None, dtype = None, operations = None):
super().__init__()
self.block = nn.Sequential(
Snake1d(input_dim, device = device, dtype = dtype),
WNConvTranspose1d(
input_dim,
output_dim,
kernel_size=2 * stride,
stride=stride,
padding=math.ceil(stride / 2),
output_padding=stride % 2, # out_pad,
device = device, dtype = dtype, operations = operations
),
DACResidualUnit(output_dim, dilation=1, device = device, dtype = dtype, operations = operations),
DACResidualUnit(output_dim, dilation=3, device = device, dtype = dtype, operations = operations),
DACResidualUnit(output_dim, dilation=9, device = device, dtype = dtype, operations = operations),
)
def forward(self, x):
return self.block(x)
class DACDecoder(nn.Module):
def __init__(
self,
input_channel,
channels,
rates,
d_out: int = 1,
device = None, dtype = None, operations = None
):
super().__init__()
# Add first conv layer
layers = [WNConv1d(input_channel, channels, kernel_size=7, padding=3, device = device, dtype = dtype, operations = operations )]
# Add upsampling + MRF blocks
for i, stride in enumerate(rates):
input_dim = channels // 2**i
output_dim = channels // 2 ** (i + 1)
layers += [DACDecoderBlock(input_dim, output_dim, stride, device = device, dtype = dtype, operations = operations)]
# Add final conv layer
layers += [
Snake1d(output_dim, device = device, dtype = dtype),
WNConv1d(output_dim, d_out, kernel_size=7, padding=3, device = device, dtype = dtype, operations = operations),
]
self.model = nn.Sequential(*layers)
def forward(self, x):
return self.model(x)
class Conv1d1x1:
def __new__(cls, in_channels, out_channels, bias=True, device=None, dtype=None, operations=None):
operations = operations or nn
return operations.Conv1d(
in_channels, out_channels, kernel_size=1,
bias=bias, device=device, dtype=dtype
)
class Conv1d(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int = 1,
padding: int = -1,
dilation: int = 1,
groups: int = 1,
bias: bool = True,
device = None, dtype = None, operations = None
):
super().__init__()
if padding < 0:
padding = (kernel_size - 1) // 2 * dilation
self.dilation = dilation
self.conv = operations.Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
device = device, dtype = dtype
)
def forward(self, x):
x = self.conv(x)
return x
class ConvTranspose1d(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int,
padding=-1,
output_padding=-1,
groups=1,
bias=True,
device = None, dtype = None, operations = None
):
super().__init__()
if padding < 0:
padding = (stride + 1) // 2
if output_padding < 0:
output_padding = 1 if stride % 2 else 0
self.deconv = operations.ConvTranspose1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
output_padding=output_padding,
groups=groups,
bias=bias,
device = device, dtype = dtype
)
def forward(self, x):
x = self.deconv(x)
return x
class ResidualUnit(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size=3,
dilation=1,
bias=False,
nonlinear_activation="ELU",
nonlinear_activation_params={},
device = None, dtype = None, operations = None
):
super().__init__()
self.activation = getattr(nn, nonlinear_activation)(**nonlinear_activation_params)
self.conv1 = Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=1,
dilation=dilation,
bias=bias,
device = device, dtype = dtype, operations = operations
)
self.conv2 = Conv1d1x1(out_channels, out_channels, bias, device = device, dtype = dtype, operations = operations)
def forward(self, x):
y = self.conv1(self.activation(x))
y = self.conv2(self.activation(y))
return x + y
class EncoderBlock(nn.Module):
def __init__(
self, in_channels: int, out_channels: int, stride: int, dilations=(1, 1), unit_kernel_size=3, bias=True, device = None, dtype = None, operations = None
):
super().__init__()
self.res_units = torch.nn.ModuleList()
for dilation in dilations:
self.res_units += [ResidualUnit(in_channels, in_channels, kernel_size=unit_kernel_size, dilation=dilation, device = device, dtype = dtype, operations = operations)]
self.num_res = len(self.res_units)
kernel_size=3 if stride == 1 else (2 * stride) # special case: stride=1, do not use kernel=2
self.conv = Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size = kernel_size,
stride=stride,
bias=bias,
device = device, dtype = dtype, operations = operations
)
def forward(self, x):
for idx in range(self.num_res):
x = self.res_units[idx](x)
x = self.conv(x)
return x
class Encoder(nn.Module):
def __init__(
self,
input_channels: int,
encode_channels: int,
channel_ratios=(1, 1),
strides=(1, 1),
kernel_size=3,
bias=True,
block_dilations=(1, 1),
unit_kernel_size=3,
device = None, dtype = None, operations = None
):
super().__init__()
assert len(channel_ratios) == len(strides)
self.conv = Conv1d(
in_channels=input_channels, out_channels=encode_channels, kernel_size=kernel_size, stride=1, bias=False,
device = device, dtype = dtype, operations = operations
)
self.conv_blocks = torch.nn.ModuleList()
in_channels = encode_channels
for idx, stride in enumerate(strides):
out_channels = int(encode_channels * channel_ratios[idx]) # could be float
self.conv_blocks += [
EncoderBlock(
in_channels,
out_channels,
stride,
dilations=block_dilations,
unit_kernel_size=unit_kernel_size,
bias=bias,
device = device, dtype = dtype, operations = operations
)
]
in_channels = out_channels
self.num_blocks = len(self.conv_blocks)
self.out_channels = out_channels
def forward(self, x):
x = self.conv(x)
for i in range(self.num_blocks):
x = self.conv_blocks[i](x)
return x
class DecoderBlock(nn.Module):
"""Decoder block (no up-sampling)"""
def __init__(
self, in_channels: int, out_channels: int, stride: int, dilations=(1, 1), unit_kernel_size=3, bias=True, device = None, dtype = None, operations = None
):
super().__init__()
if stride == 1:
self.conv = Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3, # fix kernel=3 when stride=1 for unchanged shape
stride=stride,
bias=bias,
device = device, dtype = dtype, operations = operations
)
else:
self.conv = ConvTranspose1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=(2 * stride),
stride=stride,
bias=bias,
device = device, dtype = dtype, operations = operations
)
self.res_units = nn.ModuleList([
ResidualUnit(out_channels, out_channels, kernel_size=unit_kernel_size, dilation=d, device = device, dtype = dtype, operations = operations)
for d in dilations
])
self.num_res = len(self.res_units)
def forward(self, x):
x = self.conv(x)
for idx in range(self.num_res):
x = self.res_units[idx](x)
return x
class Decoder(nn.Module):
def __init__(
self,
code_dim: int,
output_channels: int,
decode_channels: int,
channel_ratios=(1, 1),
strides=(1, 1),
kernel_size=3,
bias=True,
block_dilations=(1, 1),
unit_kernel_size=3,
device = None, dtype = None, operations = None
):
super().__init__()
assert len(channel_ratios) == len(strides)
self.conv1 = Conv1d(
in_channels=code_dim,
out_channels=int(decode_channels * channel_ratios[0]),
kernel_size=kernel_size,
stride=1,
bias=False,
device = device, dtype = dtype, operations = operations
)
self.conv_blocks = torch.nn.ModuleList()
for idx, stride in enumerate(strides):
in_channels = int(decode_channels * channel_ratios[idx])
if idx < (len(channel_ratios) - 1):
out_channels = int(decode_channels * channel_ratios[idx + 1])
else:
out_channels = decode_channels
self.conv_blocks += [
DecoderBlock(
in_channels,
out_channels,
stride,
dilations=block_dilations,
unit_kernel_size=unit_kernel_size,
bias=bias,
device = device, dtype = dtype, operations = operations
)
]
self.num_blocks = len(self.conv_blocks)
self.conv2 = Conv1d(out_channels, output_channels, kernel_size = 3, bias=False, device = device, dtype = dtype, operations = operations)
def forward(self, z):
x = self.conv1(z)
for i in range(self.num_blocks):
x = self.conv_blocks[i](x)
x = self.conv2(x)
return x
class HiggsAudioFeatureExtractor(nn.Module):
def __init__(self, sampling_rate=16000):
super().__init__()
self.sampling_rate = sampling_rate
def forward(self, audio_signal):
audio_signal = audio_signal.unsqueeze(0)
if len(audio_signal.shape) < 3:
audio_signal = audio_signal.unsqueeze(0)
return {"input_values": audio_signal}
def uniform_init(*shape: int, device = None, dtype = None):
t = torch.empty(shape, device = device, dtype = dtype)
nn.init.kaiming_uniform_(t)
return t
class EuclideanCodebook(nn.Module):
def __init__(
self,
dim: int,
codebook_size: int,
kmeans_init: bool = False,
kmeans_iters: int = 10,
decay: float = 0.99,
epsilon: float = 1e-5,
threshold_ema_dead_code: int = 2,
device = None, dtype = None
):
super().__init__()
self.decay = decay
init_fn = uniform_init
embed = init_fn(codebook_size, dim, device = device, dtype = dtype)
self.codebook_size = codebook_size
self.kmeans_iters = kmeans_iters
self.epsilon = epsilon
self.threshold_ema_dead_code = threshold_ema_dead_code
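# The buffers registered below mirror the EMA codebook statistics from the VQ-VAE
# paper; in this inference-oriented port they are only loaded from pretrained
# weights (no EMA update step is implemented here). For reference:
#   N_i <- decay * N_i + (1 - decay) * n_i          (cluster_size, eq. 6)
#   m_i <- decay * m_i + (1 - decay) * sum_j z_i,j   (embed_avg, eq. 7)
#   e_i  = m_i / N_i                                 (effective codebook entry)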
# Flag variable to indicate whether the codebook is initialized
self.register_buffer("inited", torch.Tensor([not kmeans_init]))
# Running EMA cluster size/count: N_i^t in eq. (6) of the VQ-VAE paper
self.register_buffer("cluster_size", torch.zeros(codebook_size))
# Codebook
self.register_buffer("embed", embed)
# EMA codebook accumulator: eq. (7) of the VQ-VAE paper
self.register_buffer("embed_avg", embed.clone())
def preprocess(self, x):
x = x.view(-1, x.shape[-1])
return x
def quantize(self, x):
embed = self.embed.t()
if x.dtype != embed.dtype:
x = x.to(embed.dtype)
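# dist below is the negative squared Euclidean distance,
# -(||x||^2 - 2 x.e + ||e||^2) = -||x - e||^2, so taking argmax selects the
# nearest codebook entry for each input vector.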
dist = -(x.pow(2).sum(1, keepdim=True) - 2 * x @ embed + embed.pow(2).sum(0, keepdim=True))
embed_ind = dist.max(dim=-1).indices
return embed_ind
def postprocess_emb(self, embed_ind, shape):
return embed_ind.view(*shape[:-1])
def dequantize(self, embed_ind):
quantize = F.embedding(embed_ind, self.embed)
return quantize
def encode(self, x):
shape = x.shape
# pre-process
x = self.preprocess(x) # [B, T, D] -> [B*T, D]
# quantize
embed_ind = self.quantize(x)
# post-process
embed_ind = self.postprocess_emb(embed_ind, shape)
return embed_ind
def decode(self, embed_ind):
quantize = self.dequantize(embed_ind)
return quantize
def forward(self, x):
orig_shape = x.shape # [B, T, D]
flat = x.view(-1, x.shape[-1]) # [B*T, D]
embed_ind = self.quantize(flat)
embed_ind = self.postprocess_emb(embed_ind, orig_shape)
# now embed_ind has shape [B, T]
quantize = self.dequantize(embed_ind)
# quantize: [B, T, D]
return quantize, embed_ind
class VectorQuantization(nn.Module):
def __init__(
self,
dim: int,
codebook_size: int,
codebook_dim: Optional[int] = None,
decay: float = 0.99,
epsilon: float = 1e-5,
kmeans_init: bool = True,
kmeans_iters: int = 50,
threshold_ema_dead_code: int = 2,
commitment_weight: float = 1.0,
device = None, dtype = None, operations = None
):
super().__init__()
_codebook_dim: int = codebook_dim if codebook_dim is not None else dim
requires_projection = _codebook_dim != dim
self.project_in = operations.Linear(dim, _codebook_dim, device = device, dtype = dtype) if requires_projection else nn.Identity()
self.project_out = operations.Linear(_codebook_dim, dim, device = device, dtype = dtype) if requires_projection else nn.Identity()
self.epsilon = epsilon
self.commitment_weight = commitment_weight
self._codebook = EuclideanCodebook(
dim=_codebook_dim,
codebook_size=codebook_size,
kmeans_init=kmeans_init,
kmeans_iters=kmeans_iters,
decay=decay,
epsilon=epsilon,
threshold_ema_dead_code=threshold_ema_dead_code,
device = device, dtype = dtype
)
self.codebook_size = codebook_size
@property
def codebook(self):
return self._codebook.embed
def encode(self, x):
x = x.permute(0, 2, 1)
x = self.project_in(x)
embed_in = self._codebook.encode(x)
return embed_in
def decode(self, embed_ind):
quantize = self._codebook.decode(embed_ind)
quantize = self.project_out(quantize)
quantize = quantize.permute(0, 2, 1)
return quantize
def forward(self, x):
device = x.device
x = x.transpose(1, 2).contiguous() # [b d n] -> [b n d]
x = self.project_in(x)
quantize, embed_ind = self._codebook(x)
loss = torch.tensor([0.0], device=device, requires_grad=self.training)
quantize = self.project_out(quantize)
quantize = quantize.transpose(1, 2).contiguous() # [b n d] -> [b d n]
return quantize, embed_ind, loss
class ResidualVectorQuantization(nn.Module):
def __init__(self, *, num_quantizers, device = None, dtype = None, operations = None, **kwargs):
super().__init__()
self.layers = nn.ModuleList([VectorQuantization(device = device, dtype = dtype, operations = operations, **kwargs) for _ in range(num_quantizers)])
def forward(self, x, n_q: Optional[int] = None):
quantized_out = 0.0
residual = x
all_losses = []
all_indices = []
n_q = n_q or len(self.layers)
for layer in self.layers[:n_q]:
quantized, indices, loss = layer(residual)
residual = residual - quantized
quantized_out = quantized_out + quantized
all_indices.append(indices)
all_losses.append(loss)
out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
return quantized_out, out_indices, out_losses
def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
""" Vectorized Implementation of dequantization | 2x faster than original impl """
biases = torch.stack([layer.project_out.bias for layer in self.layers])
codebook_device = self.layers[0]._codebook.embed.device
q_indices = q_indices.to(codebook_device)
def decode_one(codebook_weight, proj_weight, embed_id, proj_biases):
quantized = F.embedding(embed_id, codebook_weight).transpose(1, 2) # (B, D, T)
quantized = F.linear(quantized.transpose(1, 2), proj_weight, proj_biases).transpose(1, 2)
return quantized
codebook_weights = torch.stack([q._codebook.embed for q in self.layers]) # (n_codebooks, vocab_size, D)
proj_weights = torch.stack([q.project_out.weight for q in self.layers])
quantized = vmap(decode_one)(codebook_weights, proj_weights, q_indices, biases)
return quantized.sum(0)
class ResidualVectorQuantizer(nn.Module):
def __init__(
self,
dimension: int = 256,
codebook_dim: Optional[int] = None,
n_q: int = 8,
bins: int = 1024,
decay: float = 0.99,
kmeans_init: bool = True,
kmeans_iters: int = 50,
threshold_ema_dead_code: int = 2,
device = None,
dtype = None,
operations = None
):
super().__init__()
self.n_q = n_q
self.dimension = dimension
self.codebook_dim = codebook_dim
self.bins = bins
self.decay = decay
self.kmeans_init = kmeans_init
self.kmeans_iters = kmeans_iters
self.threshold_ema_dead_code = threshold_ema_dead_code
self.vq = ResidualVectorQuantization(
dim=self.dimension,
codebook_dim=self.codebook_dim,
codebook_size=self.bins,
num_quantizers=self.n_q,
decay=self.decay,
kmeans_init=self.kmeans_init,
kmeans_iters=self.kmeans_iters,
threshold_ema_dead_code=self.threshold_ema_dead_code,
device = device, dtype = dtype, operations = operations
)
def forward(self, x: torch.Tensor, sample_rate: int, bandwidth: Optional[float] = None): # -> QuantizedResult:
bw_per_q = self.get_bandwidth_per_quantizer(sample_rate)
n_q = self.get_num_quantizers_for_bandwidth(sample_rate, bandwidth)
quantized, codes, commit_loss = self.vq(x, n_q=n_q)
bw = torch.tensor(n_q * bw_per_q).to(x)
return quantized, codes, bw, torch.mean(commit_loss)
def get_num_quantizers_for_bandwidth(self, sample_rate: int, bandwidth: Optional[float] = None) -> int:
"""Return n_q based on specified target bandwidth."""
bw_per_q = self.get_bandwidth_per_quantizer(sample_rate)
n_q = self.n_q
if bandwidth and bandwidth > 0.0:
n_q = int(max(1, math.floor(bandwidth / bw_per_q)))
return n_q
def get_bandwidth_per_quantizer(self, sample_rate: int):
"""Return bandwidth per quantizer for a given input sample rate."""
return math.log2(self.bins) * sample_rate / 1000
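# For example, with bins=1024 each code carries log2(1024) = 10 bits per frame, so at
# a token frame rate of F frames/s one quantizer costs 10 * F / 1000 kbps (the
# `sample_rate` argument here is the token frame rate; `_xcodec_encode` passes
# `self.frame_rate`). get_num_quantizers_for_bandwidth() then floors the requested
# bandwidth divided by this per-quantizer cost, with a minimum of one quantizer.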
def decode(self, codes: torch.Tensor) -> torch.Tensor:
"""Decode the given codes to the quantized representation."""
quantized = self.vq.decode(codes)
return quantized
class HiggsAudioTokenizer(nn.Module):
def __init__(
self,
D: int = 256,
target_bandwidths= [0.5, 1, 1.5, 2, 4],
ratios = [8, 5, 4, 2, 3], # downsampling by 320
sample_rate: int = 24000,
bins: int = 1024,
n_q: int = 8,
codebook_dim: int = 64,
last_layer_semantic: bool = True,
downsample_mode: str = "step_down",
vq_scale: int = 1,
semantic_sample_rate: Optional[int] = None,
device = None,
dtype = None,
operations = None,
**kwargs
):
super().__init__()
operations = operations or nn
self.hop_length = np.prod(ratios)
self.frame_rate = math.ceil(sample_rate / np.prod(ratios)) # 50 Hz
self.target_bandwidths = target_bandwidths
self.n_q = n_q
self.sample_rate = sample_rate
self.encoder = DACEncoder(64, ratios, D, device = device, dtype = dtype, operations = operations)
self.decoder_2 = DACDecoder(D, 1024, ratios, device = device, dtype = dtype, operations = operations)
self.last_layer_semantic = last_layer_semantic
self.device = device
self.semantic_model = AutoModel.from_pretrained("bosonai/hubert_base", trust_remote_code=True)
self.semantic_sample_rate = 16000
self.semantic_dim = 768
self.encoder_semantic_dim = 768
# Overwrite semantic model sr to ensure semantic_downsample_factor is an integer
if semantic_sample_rate is not None:
self.semantic_sample_rate = semantic_sample_rate
self.semantic_model.eval()
# freeze the semantic model so its parameters do not require gradients
for param in self.semantic_model.parameters():
param.requires_grad = False
self.semantic_downsample_factor = int(self.hop_length / (self.sample_rate / self.semantic_sample_rate) / 320)
self.quantizer_dim = int((D + self.encoder_semantic_dim) // vq_scale)
self.encoder_semantic = Encoder(input_channels=self.semantic_dim, encode_channels=self.encoder_semantic_dim, device = device, dtype = dtype, operations = operations)
self.decoder_semantic = Decoder(
code_dim=self.encoder_semantic_dim, output_channels=self.semantic_dim, decode_channels=self.semantic_dim, device = device, dtype = dtype, operations = operations
)
self.quantizer = ResidualVectorQuantizer(
dimension=self.quantizer_dim, codebook_dim=codebook_dim, n_q=n_q, bins=bins, device = device, dtype = dtype, operations = operations
)
self.fc_prior = operations.Linear(D + self.encoder_semantic_dim, self.quantizer_dim, device = device, dtype = dtype)
self.fc_post1 = operations.Linear(self.quantizer_dim, self.encoder_semantic_dim, device = device, dtype = dtype)
self.fc_post2 = operations.Linear(self.quantizer_dim, D, device = device, dtype = dtype)
self.downsample_mode = downsample_mode
self.audio_tokenizer_feature_extractor = HiggsAudioFeatureExtractor(sampling_rate=self.sample_rate)
@property
def sampling_rate(self):
return self.sample_rate
@torch.no_grad()
def get_regress_target(self, x):
x = torchaudio.functional.resample(x, self.sample_rate, self.semantic_sample_rate)
x = x[:, 0, :]
x = F.pad(x, (160, 160))
target = self.semantic_model(x, output_hidden_states=True).hidden_states
target = torch.stack(target, dim=1)
target = target.mean(1)
if self.downsample_mode == "step_down":
if self.semantic_downsample_factor > 1:
target = target[:, :: self.semantic_downsample_factor, :]
return target
def forward(self):
pass
@property
def tps(self):
return self.frame_rate
def encode(self, wv, sr):
if sr != self.sampling_rate:
# best computed values to match librosa's resample
resampler_torch = torchaudio.transforms.Resample(
orig_freq=sr,
new_freq=self.sampling_rate,
resampling_method="sinc_interp_kaiser",
lowpass_filter_width = 121,
rolloff = 0.9568384289091556,
beta = 21.01531462440614
).to(wv.device)
wv = resampler_torch(wv)
if self.audio_tokenizer_feature_extractor is not None:
inputs = self.audio_tokenizer_feature_extractor(wv)
input_values = inputs["input_values"].to(self.device)
else:
input_values = torch.from_numpy(wv).float().unsqueeze(0)
with torch.no_grad():
input_values = input_values.to(wv.device)
encoder_outputs = self._xcodec_encode(input_values)
vq_code = encoder_outputs[0]
return vq_code
def _xcodec_encode(self, x: torch.Tensor, target_bw: Optional[int] = None) -> torch.Tensor:
bw = target_bw
e_semantic_input = self.get_regress_target(x).detach()
e_semantic = self.encoder_semantic(e_semantic_input.transpose(1, 2))
e_acoustic = self.encoder(x)
if e_acoustic.shape[2] != e_semantic.shape[2]:
pad_size = 160 * self.semantic_downsample_factor
e_acoustic = self.encoder(F.pad(x[:, 0, :], (pad_size, pad_size)).unsqueeze(0))
if e_acoustic.shape[2] != e_semantic.shape[2]:
if e_acoustic.shape[2] > e_semantic.shape[2]:
e_acoustic = e_acoustic[:, :, : e_semantic.shape[2]]
else:
e_semantic = e_semantic[:, :, : e_acoustic.shape[2]]
e = torch.cat([e_acoustic, e_semantic], dim=1)
e = self.fc_prior(e.transpose(1, 2))
e = e.transpose(1, 2)
_, codes, _, _ = self.quantizer(e, self.frame_rate, bw)
codes = codes.permute(1, 0, 2)
return codes
def decode(self, vq_code: torch.Tensor) -> torch.Tensor:
vq_code = vq_code.to(self.device)
if vq_code.ndim < 3:
vq_code = vq_code.unsqueeze(0)
vq_code = vq_code.permute(1, 0, 2)
quantized = self.quantizer.decode(vq_code)
quantized = quantized.transpose(1, 2)
quantized_acoustic = self.fc_post2(quantized).transpose(1, 2)
o = self.decoder_2(quantized_acoustic)
return o.detach()

View File

@ -42,6 +42,7 @@ import comfy.ldm.hidream.model
import comfy.ldm.chroma.model
import comfy.ldm.ace.model
import comfy.ldm.omnigen.omnigen2
import comfy.ldm.higgsv2.model
import comfy.model_management
import comfy.patcher_extension
@ -1277,3 +1278,7 @@ class Omnigen2(BaseModel):
if ref_latents is not None:
out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16])
return out
class Higgsv2(BaseModel):
def __init__(self, model_config, model_type=ModelType.EPS, device=None, unet_model=comfy.ldm.higgsv2.model.HiggsAudioModel):
super().__init__(model_config, model_type, device, unet_model)

View File

@ -388,6 +388,87 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
return dit_config return dit_config
if f"{key_prefix}t_embedder.mlp.2.weight" in state_dict_keys: # Hunyuan 3D 2.1
dit_config = {}
dit_config["image_model"] = "hunyuan3d2_1"
dit_config["in_channels"] = state_dict[f"{key_prefix}x_embedder.weight"].shape[1]
dit_config["context_dim"] = 1024
dit_config["hidden_size"] = state_dict[f"{key_prefix}x_embedder.weight"].shape[0]
dit_config["mlp_ratio"] = 4.0
dit_config["num_heads"] = 16
dit_config["depth"] = count_blocks(state_dict_keys, f"{key_prefix}blocks.{{}}")
dit_config["qkv_bias"] = False
dit_config["guidance_cond_proj_dim"] = None#f"{key_prefix}t_embedder.cond_proj.weight" in state_dict_keys
return dit_config
if "{}layers.27.audio_post_attention_layernorm.weight".format(key_prefix) in state_dict_keys:
autoregressive_config = {}
autoregressive_config["image_model"] = "higgsv2"
autoregressive_config["audio_adapter_type"] = "dual_ffn_fast_forward"
autoregressive_config["audio_bos_token"] = "<|audio_bos|>"
autoregressive_config["audio_codebook_size"] = 1024
autoregressive_config["audio_num_codebooks"] = 8
autoregressive_config["audio_ffn_hidden_size"] = 3072
autoregressive_config["audio_ffn_intermediate_size"] = 8192
autoregressive_config["audio_in_token"] = "<|AUDIO|>"
autoregressive_config["audio_in_token_idx"] = 128015
autoregressive_config["audio_out_token"] = "<|AUDIO_OUT|>"
autoregressive_config["audio_out_token_idx"] = 128016
autoregressive_config["audio_out_bos_token"] = "<|audio_out_bos|>"
autoregressive_config["audio_out_bos_token_id"] = 128013
autoregressive_config["audio_eos_token"] = "<|audio_eos|>"
autoregressive_config["audio_eos_token_id"] = 128012
autoregressive_config["audio_stream_bos_id"] = 1024
autoregressive_config["audio_stream_eos_id"] = 1025
autoregressive_config["encode_audio_in_tokens"] = True
autoregressive_config["pad_token_id"] = 128001
autoregressive_config["padding_idx"] = 128001
autoregressive_config["hidden_size"] = 3072
autoregressive_config["use_delay_pattern"] = True
autoregressive_config["vocab_size"] = 128256
autoregressive_config["num_hidden_layers"] = 28
autoregressive_config["num_attention_heads"] = 24
autoregressive_config["num_key_value_heads"] = 8
autoregressive_config["max_seq_len"] = 131072
autoregressive_config["max_position_embeddings"] = 131072
autoregressive_config["bos_token_id"] = 128000
autoregressive_config["eos_token_id"] = 128001
autoregressive_config["use_cache"] = True
autoregressive_config["text_config"] = {
"model_type": "llama",
"vocab_size": 128256,
"max_position_embeddings": 131072,
"num_hidden_layers": 28,
"hidden_size": 3072,
"num_attention_heads": 24,
"num_key_value_heads": 8,
"initializer_range": 0.02,
"rms_norm_eps": 1e-05,
"pad_token_id": None,
"bos_token_id": 128000,
"eos_token_id": 128001,
"num_return_sequences": 1,
"head_dim": 128,
"mlp_bias": False,
"intermediate_size": 8192
}
autoregressive_config["use_kv_buckets"] = True
autoregressive_config["num_attention_heads"] = 24
autoregressive_config["audio_decoder_proj_num_layers"] = 0
autoregressive_config["audio_dual_ffn_layers"] = list(range(28))
autoregressive_config["output_vae"] = False
return autoregressive_config
if '{}caption_projection.0.linear.weight'.format(key_prefix) in state_dict_keys: # HiDream
dit_config = {}
dit_config["image_model"] = "hidream"

View File

@ -1041,6 +1041,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
output_vae = model_config.unet_config.get("output_vae", output_vae)
if model_config.clip_vision_prefix is not None:
if output_clipvision:

View File

@ -19,6 +19,7 @@ import comfy.text_encoders.lumina2
import comfy.text_encoders.wan
import comfy.text_encoders.ace
import comfy.text_encoders.omnigen2
import comfy.text_encoders.higgsv2
from . import supported_models_base
from . import latent_formats
@ -1216,7 +1217,22 @@ class Omnigen2(supported_models_base.BASE):
hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref))
return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.LuminaTokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect))
class Higgsv2(supported_models_base.BASE):
unet_config = {
"image_model": "higgsv2",
}
memory_usage_factor = 1.0
supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
text_encoder_key_prefix = ["dac."]
def get_model(self, state_dict, prefix="", device=None):
out = model_base.Higgsv2(self, device=device)
return out
def clip_target(self, state_dict = {}):
return supported_models_base.ClipTarget(comfy.text_encoders.higgsv2.DummyTokenizer, comfy.text_encoders.higgsv2.HiggsTokenizer)
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ACEStep, Omnigen2, Higgsv2]
models += [SVD_img2vid]

View File

@ -0,0 +1,16 @@
{
"bos_token": {
"content": "<|begin_of_text|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<|end_of_text|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,80 @@
import os
import torch
import comfy.ops
import torch.nn as nn
from transformers import AutoTokenizer
from comfy.ldm.higgsv2.tokenizer import HiggsAudioTokenizer
from comfy.ldm.higgsv2.preprocess import HiggsAudioSampleCollator
class DummyTokenizer:
def __init__(self, embedding_directory=None, tokenizer_data={}):
pass
def revert_delay_pattern_vectorized(data: torch.Tensor) -> torch.Tensor:
num_codebooks, total_len = data.shape
seq_len = total_len - num_codebooks + 1
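# Equivalent to shifting row k of `data` left by k positions, i.e. out[k, t] = data[k, t + k],
# removing the per-codebook delay with a single gather instead of a Python loop.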
col_idx = torch.arange(seq_len, device=data.device)[None, :] \
+ torch.arange(num_codebooks, device=data.device)[:, None]
out = data[torch.arange(num_codebooks)[:, None], col_idx]
return out
class HiggsTokenizer(nn.Module):
def __init__(self, device, dtype, model_options={}, **kwargs):
super().__init__()
self.dtype = torch.float32
self.device = device
self.dtypes = [torch.float32]
here = os.path.dirname(__file__)
tokenizer_path = os.path.join(here, "higgs_text_tokenizer")
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
scaled_fp8 = model_options.get("scaled_fp8", None)
if scaled_fp8 is not None:
operations = comfy.ops.scaled_fp8_ops(fp8_matrix_mult=False, override_dtype=scaled_fp8)
else:
operations = comfy.ops.manual_cast
self.audio_codebook_size = 1024
self.audio_tokenizer = HiggsAudioTokenizer(device = device, dtype = dtype, operations = operations)
if scaled_fp8 is not None:
self.audio_tokenizer.scaled_fp8 = torch.nn.Parameter(torch.tensor([], dtype=scaled_fp8))
self.collator = HiggsAudioSampleCollator(
audio_in_token_id = 128015,
audio_out_token_id = 128016,
audio_stream_bos_id = 1024,
audio_stream_eos_id = 1025,
pad_token_id = 128001,
return_audio_in_tokens = False,
use_delay_pattern = True,
audio_num_codebooks = 8,
round_to = 1,
)
postfix = "<|start_header_id|>assistant<|end_header_id|>\n\n"
self.postfix = postfix + "<|audio_out_bos|>" # force audio generation
def decode_tokens(self, audio_tokens):
outputs = []
# due to instability issues, the audio tokenizer is cast to float32 to avoid outputting NaNs
self.audio_tokenizer = self.audio_tokenizer.to(self.dtype)
torch.cuda.synchronize()
for audio in audio_tokens:
vq_code = revert_delay_pattern_vectorized(audio).clip(0, self.audio_codebook_size - 1)[:, 1:-1]
wv_numpy = self.audio_tokenizer.decode(vq_code.unsqueeze(0))[0, 0]
outputs.append(wv_numpy)
return (None, {"waveform": torch.stack(outputs, dim = 0).unsqueeze(1), "sample_rate": self.audio_tokenizer.sample_rate}) # audio only
def load_state_dict(self, sd, strict = False):
return self.audio_tokenizer.load_state_dict(sd, strict = strict)
def state_dict(self):
return self.audio_tokenizer.state_dict()

View File

@ -1,10 +1,11 @@
import torch
import torch.nn as nn
from dataclasses import dataclass
from typing import Optional, Any
from dataclasses import dataclass, field
from transformers.cache_utils import Cache
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
from comfy.ldm.modules.attention import optimized_attention_for_device
import comfy.model_management
import comfy.ldm.common_dit
import comfy.model_management
@ -21,10 +22,20 @@ class Llama2Config:
rms_norm_eps: float = 1e-5
rope_theta: float = 500000.0
transformer_type: str = "llama"
head_dim = 128
head_dim: int = 128
rms_norm_add = False
mlp_activation = "silu"
qkv_bias = False
qkv_bias: bool = False
rope_type: str = "llama3"
rope_scaling: dict = field(
default_factory=lambda: {
"factor": 32.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
}
)
@dataclass
class Qwen25_3BConfig:
@ -103,15 +114,67 @@ def apply_rope(xq, xk, freqs_cis):
sin = freqs_cis[1].unsqueeze(1)
q_embed = (xq * cos) + (rotate_half(xq) * sin)
k_embed = (xk * cos) + (rotate_half(xk) * sin)
return q_embed, k_embed
return q_embed, k_embed, sin, cos
class LlamaRoPE(nn.Module):
def __init__(self, config, device = None, dtype = None):
super().__init__()
if config.rope_scaling is not None:
self.rope_type = config.rope_scaling.get("rope_type", config.rope_type)
else:
self.rope_type = "default"
self.config = config
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
self.register_buffer("inv_freq", inv_freq, persistent=False)
self.original_inv_freq = self.inv_freq
def _dynamic_frequency_update(self, position_ids, device):
seq_len = torch.max(position_ids) + 1
if seq_len > self.max_seq_len_cached:
inv_freq, self.attention_scaling = self.rope_init_fn(
self.config, device, seq_len=seq_len, **self.rope_kwargs
)
self.register_buffer("inv_freq", inv_freq, persistent=False)
self.max_seq_len_cached = seq_len
if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:
self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
self.max_seq_len_cached = self.original_max_seq_len
@torch.no_grad()
def forward(self, x, position_ids):
if "dynamic" in self.rope_type:
self._dynamic_frequency_update(position_ids, device=x.device)
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
position_ids_expanded = position_ids[:, None, :].float()
device_type = x.device.type
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False):
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
emb = torch.cat((freqs, freqs), dim=-1)
cos = emb.cos()
sin = emb.sin()
cos = cos * self.attention_scaling
sin = sin * self.attention_scaling
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
class Attention(nn.Module):
def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None):
def __init__(self, config: Llama2Config, layer_idx: int = None, device=None, dtype=None, ops: Any = None):
super().__init__()
self.num_heads = config.num_attention_heads
self.num_kv_heads = config.num_key_value_heads
self.hidden_size = config.hidden_size
self.layer_idx = layer_idx
self.head_dim = config.head_dim
self.inner_size = self.num_heads * self.head_dim
@ -126,6 +189,8 @@ class Attention(nn.Module):
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Cache] = None,
cache_position: Optional[torch.LongTensor] = None,
freqs_cis: Optional[torch.Tensor] = None,
optimized_attention=None,
):
@ -138,13 +203,22 @@ class Attention(nn.Module):
xk = xk.view(batch_size, seq_length, self.num_kv_heads, self.head_dim).transpose(1, 2)
xv = xv.view(batch_size, seq_length, self.num_kv_heads, self.head_dim).transpose(1, 2)
xq, xk = apply_rope(xq, xk, freqs_cis=freqs_cis)
xq, xk, sin, cos = apply_rope(xq, xk, freqs_cis=freqs_cis)
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
xk, xv = past_key_value.update(xk, xv, self.layer_idx, cache_kwargs)
xk = xk.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
xv = xv.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
output = optimized_attention(xq, xk, xv, self.num_heads, mask=attention_mask, skip_reshape=True)
return self.o_proj(output)
out = self.o_proj(output)
if past_key_value is not None:
return out, past_key_value
return out
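# Hedged sketch (not part of the commit): what past_key_value.update does across decode steps.
# DynamicCache appends the new key/value states for the given layer and hands back the full
# cached tensors, so attention over a single new token still sees the whole prefix.
def _kv_cache_growth_sketch():
    from transformers.cache_utils import DynamicCache
    cache = DynamicCache()
    batch, heads, dim = 1, 8, 64
    k_prefill, v_prefill = torch.randn(batch, heads, 5, dim), torch.randn(batch, heads, 5, dim)
    keys, values = cache.update(k_prefill, v_prefill, layer_idx=0)      # prefill: 5 tokens cached
    k_step, v_step = torch.randn(batch, heads, 1, dim), torch.randn(batch, heads, 1, dim)
    keys, values = cache.update(k_step, v_step, layer_idx=0)            # one decode step appended
    return keys.shape[2]                                                # 6: grows by one per step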
class MLP(nn.Module):
    def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None):

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import av
import re
import torchaudio
import torch
import comfy.model_management
@@ -8,11 +9,247 @@ import folder_paths
import os
import io
import json
import base64
import random
import hashlib
import numpy as np
import node_helpers
from io import BytesIO
from comfy.cli_args import args
from comfy.comfy_types import IO
from comfy.comfy_types import FileLocator
from dataclasses import asdict
from comfy.ldm.higgsv2.loudness import loudness
from comfy.ldm.higgsv2.preprocess import (
prepare_chatml_sample, Message, ChatMLSample, ChatMLDatasetSample, AudioContent, TextContent, transcript_normalize
)
AUDIO_PLACEHOLDER_TOKEN = "<|__AUDIO_PLACEHOLDER__|>"
MULTISPEAKER_DEFAULT_SYSTEM_MESSAGE = """You are an AI assistant designed to convert text into speech.
If the user's message includes a [SPEAKER*] tag, do not read out the tag and generate speech for the following text, using the specified voice.
If no speaker tag is present, select a suitable voice on your own."""
class LoudnessNormalization:
CATEGORY = "audio"
RETURN_TYPES = ("AUDIO",)
FUNCTION = "normalize"
@classmethod
def INPUT_TYPES(s):
return {"required": {"audio": ("AUDIO", ),
"block_size": ("FLOAT", {"default": 0.400, "min": 0.1, "max": 1.0, "step": 0.05}),
"loudness_threshold": ("FLOAT", {"default": -23.0, "min": -70.0, "max": 0.0, "step": 0.5,
"tooltip": "Target loudness in LUFS. Common values are -23.0 (broadcast), -14.0 (streaming)."})}}
def normalize(self, audio, loudness_threshold, block_size):
sampling_rate = audio["sample_rate"]
waveform = audio["waveform"]
return {"waveform": loudness(waveform, sampling_rate, target_loudness = loudness_threshold, block_size = block_size), "sample_rate": sampling_rate}
def prepare_chatml_input(
clip,
input_tokens,
audio_contents,
sampling_rate,
postfix_str: str = "",
):
if hasattr(clip, "postfix"):
postfix_str = clip.postfix
if postfix_str:
postfix = clip.tokenizer.encode(postfix_str, add_special_tokens=False)
input_tokens.extend(postfix)
audio_ids_l = []
if audio_contents is not None:
if not hasattr(clip, "audio_tokenizer"):
raise ValueError("This model does not have an audio tokenizer. The chosen model may not support ChatML Format")
for audio_content in audio_contents:
audio_content.raw_audio = audio_content.raw_audio.squeeze(0)
if audio_content.raw_audio.shape[0] == 2:
audio_content.raw_audio = audio_content.raw_audio.mean(dim = 0, keepdim = True)
if audio_content.raw_audio.device != next(clip.audio_tokenizer.parameters()).device:
audio_content.raw_audio = audio_content.raw_audio.to(next(clip.audio_tokenizer.parameters()).device)
audio_ids = clip.audio_tokenizer.encode(audio_content.raw_audio, sampling_rate)
audio_ids_l.append(audio_ids.squeeze(0))
if len(audio_ids_l) > 0:
audio_ids_start = torch.tensor(
np.cumsum(np.array([0] + [audio_ids.shape[1] for audio_ids in audio_ids_l])),
dtype=torch.long,
device=audio_contents[0].raw_audio.device,
).to("cpu")[0:-1]
audio_ids_concat = torch.cat(audio_ids_l, dim=1).to("cpu")
else:
audio_ids_start = None
audio_ids_concat = None
sample = ChatMLDatasetSample(
input_ids=torch.LongTensor(input_tokens),
label_ids=None,
audio_ids_concat=audio_ids_concat,
audio_ids_start=audio_ids_start,
audio_waveforms_concat=None,
audio_waveforms_start=None,
audio_sample_rate=None,
audio_speaker_indices=None,
)
if hasattr(clip, "collator"):
sample.input_ids = sample.input_ids.cpu()
sample = clip.collator([sample])
inputs = asdict(sample)
for k, v in inputs.items():
if isinstance(v, torch.Tensor):
inputs[k] = v.to(clip.device)
return inputs
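# Hedged sketch of the text-only path through prepare_chatml_input: with no audio contents the
# sample only carries input_ids, and the collator (when the model provides one) batches it into
# tensors. `clip` is assumed to expose .tokenizer/.device/.collator like the audio-capable
# models this file targets.
def _text_only_chatml_input_sketch(clip, text: str):
    input_tokens = clip.tokenizer.encode(text, add_special_tokens=False)
    return prepare_chatml_input(clip, input_tokens, audio_contents=None, sampling_rate=None)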
def postprocess_chatml(text: str) -> str:
speakers = set(re.findall(r'\[SPEAKER\d+\]', text))
skip_recon = True
if len(speakers) > 1:
parts = text.split('<|eot_id|>')
# keep the first <|eot_id|> and the last one
first_eot = parts[0] + '<|eot_id|>'
middle_parts = ''.join(parts[1:-1])
last_eot = '<|eot_id|>' + parts[-1]
text = first_eot + middle_parts + last_eot
skip_recon = False
return text, skip_recon
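# Worked example for the helper above (sample text is made up; token strings come from the code):
# with two speakers, every <|eot_id|> except the first and the last is dropped, merging the
# middle turns into one continuation.
def _postprocess_chatml_example():
    text = "[SPEAKER0] hi<|eot_id|>[SPEAKER1] hello<|eot_id|>[SPEAKER0] bye<|eot_id|>"
    merged, skip_recon = postprocess_chatml(text)
    # merged == "[SPEAKER0] hi<|eot_id|>[SPEAKER1] hello[SPEAKER0] bye<|eot_id|>"
    # skip_recon is False because more than one speaker tag was found
    return merged, skip_recon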
class CreateChatMLSample:
def __init__(self):
self.device = comfy.model_management.intermediate_device()
@classmethod
def INPUT_TYPES(cls):
return {
"required": {
"text": (IO.STRING, {
"default": "SYSTEM: " + MULTISPEAKER_DEFAULT_SYSTEM_MESSAGE + "\n\n<|scene_desc_start|>\nSPEAKER0:masculine\nSPEAKER1:feminine\n<|scene_desc_end|>",
"multiline": True,
"dynamicPrompts": True,
"tooltip": (
"The conversations to be encoded. "
"To register a conversation start with SPEAKER-0: some text. "
"To add a system prompt start with system:"
),
}),
"clip": (IO.CLIP, {"tooltip": "The CLIP model used for tokenizing the text."}),
},
"optional": {
"audio": (IO.AUDIO, {
"tooltip": "An audio clip to be inserted into the conversation. To register add [audio]",
})
}
}
RETURN_TYPES = ("TOKENS",)
    OUTPUT_TOOLTIPS = ("Turns text and audio into the ChatML format.",)
FUNCTION = "convert_to_ml_format"
CATEGORY = "conditioning"
def convert_to_ml_format(self, clip, text, audio=None):
if audio is not None:
clip.load_model()
if hasattr(clip, "cond_stage_model"):
clip = clip.cond_stage_model
text = transcript_normalize(text)
messages = []
lines = text.splitlines()
sampling_rate = False
current_role = None
collecting_system = False
system_buffer = []
for line in lines:
line = line.strip()
if not line:
continue
# system start
if line.lower().startswith("system:"):
collecting_system = True
system_buffer.append(line[len("system:"):].strip())
continue
# while collecting system prompt
if collecting_system:
system_buffer.append(line)
if "<|scene_desc_end|>" in line or "SPEAKER-" in line:
system_prompt = "\n".join(system_buffer)# + "\n<|scene_desc_end|>"
messages.append(Message(role="system", content=system_prompt))
system_buffer = []
collecting_system = False
continue
# speaker lines SPEAKER-0: text
match = re.match(r"SPEAKER-(\d+):\s*(.*)", line, re.IGNORECASE)
if match:
speaker_id = match.group(1)
content = match.group(2)
current_role = f"[SPEAKER{speaker_id}] "
messages.append(Message(role = "user", content = current_role + content.strip()))
else:
# continuation line goes to last speaker or instruction
if current_role is not None and messages:
messages[-1].content += "\n" + line
# return normal input_ids
        if not messages:
return (clip.tokenizer(text),)
all_text = "".join(msg.content for msg in messages if msg.role == "user")
# postprocess to allow multi-user speech
all_text, skip_recon = postprocess_chatml(all_text)
if not skip_recon:
lines = all_text.splitlines()
messages = [messages[0]] if messages[0].role == "system" else []
current_role = None
for line in lines:
match = re.match(r'\[SPEAKER\d+\]', line)
if match:
current_role = match.group(0)
messages.append(Message(role="user", content=line.strip()))
else:
if current_role and messages:
messages[-1].content += "\n" + line.strip()
if audio is not None:
# for audio cloning, the first message is a transcript, second is the audio,
# third is the request of what the model should say
waveform = audio["waveform"]
sampling_rate = audio["sample_rate"]
messages.insert(1, Message(
role = "assistant",
content = AudioContent(raw_audio = waveform, audio_url = "placeholder")
))
chat_ml_sample = ChatMLSample(messages)
input_tokens, audio_contents, _ = prepare_chatml_sample(
chat_ml_sample,
clip.tokenizer,
)
if audio is None:
audio_contents = None
out = prepare_chatml_input(clip, input_tokens, audio_contents, sampling_rate = sampling_rate)
return (out,)
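# Hedged usage sketch for the node above: the text widget is parsed line by line into a system
# message plus one user message per "SPEAKER-N:" line. The example text below is illustrative;
# a real call also needs a loaded CLIP object.
_EXAMPLE_CHATML_TEXT = (
    "SYSTEM: You are an AI assistant designed to convert text into speech.\n"
    "<|scene_desc_start|>\n"
    "SPEAKER0:masculine\n"
    "SPEAKER1:feminine\n"
    "<|scene_desc_end|>\n"
    "SPEAKER-0: Hello there, how are you today?\n"
    "SPEAKER-1: Doing great, thanks for asking.\n"
)
# tokens = CreateChatMLSample().convert_to_ml_format(clip, _EXAMPLE_CHATML_TEXT)
# would produce [SPEAKER0]/[SPEAKER1]-prefixed user turns ready for AutoRegressiveGeneration.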
class EmptyLatentAudio:
    def __init__(self):
@@ -331,6 +568,8 @@ NODE_CLASS_MAPPINGS = {
    "LoadAudio": LoadAudio,
    "PreviewAudio": PreviewAudio,
    "ConditioningStableAudio": ConditioningStableAudio,
    "LoudnessNormalization": LoudnessNormalization,
    "CreateChatMLSample": CreateChatMLSample
}
NODE_DISPLAY_NAME_MAPPINGS = {
@@ -342,4 +581,6 @@ NODE_DISPLAY_NAME_MAPPINGS = {
    "SaveAudio": "Save Audio (FLAC)",
    "SaveAudioMP3": "Save Audio (MP3)",
    "SaveAudioOpus": "Save Audio (Opus)",
    "LoudnessNormalization": "Loudness Normalization",
    "CreateChatMLSample": "Create ChatML Sample"
}

View File

@@ -25,6 +25,7 @@ import comfy.sample
import comfy.sd
import comfy.utils
import comfy.controlnet
from comfy.autoregressive_sampling import auto_sample
from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict, FileLocator
import comfy.clip_vision
@@ -1549,6 +1550,54 @@ class KSamplerAdvanced:
            disable_noise = True
        return common_ksampler(model, noise_seed, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, denoise=denoise, disable_noise=disable_noise, start_step=start_at_step, last_step=end_at_step, force_full_denoise=force_full_denoise)
class AutoRegressiveGeneration:
@classmethod
def INPUT_TYPES(s):
return {
"required": {
"model": ("MODEL", {"tooltip": "The model used for generation."}),
"input_ids": ("TOKENS", ),
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff, "control_after_generate": True, "tooltip": "The random seed used for controling the generation."}),
"max_new_length": ("INT", {"default": 1024, "min": 1, "max": 10_000, "tooltip": "The max length for generation."}),
"min_new_length": ("INT", {"default": 1, "min": 1, "max": 10_000, "tooltip": "The min length for generation."}),
"top_k": ("INT", {"default": 50, "min": 1, "max": 30_000, "tooltip": "Takes the top k of the most probable tokens."}),
"top_p": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01, "tooltip": "Percentage of tokens to leave after generation (top most probable tokens)."}),
"temperature": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 50, "step": 0.01, "tooltip": "Temperature controls randomess by decreasing or increasing the probability of lesser likely tokens. Higher Temperature -> More Randomness"}),
"do_sample": ("BOOLEAN", {"default": False, "tooltip": "Add randomness in decoding the tokens."}),
}
}
RETURN_TYPES = ("TOKENS",)
FUNCTION = "generate"
CATEGORY = "sampling"
# for cuda graphs
_cached_autoregressive_sampler = None
def generate(self, model, input_ids, seed, max_new_length, min_new_length, top_k, top_p, temperature, do_sample):
return (auto_sample(self, model, input_ids, max_new_length, min_new_length, top_k, top_p, temperature, do_sample, seed = seed),)
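# Hedged sketch of the decode loop these inputs drive; this is not comfy.autoregressive_sampling.auto_sample
# itself, and `model_step` is a hypothetical callable returning next-token logits for the current prefix.
def _decode_loop_sketch(model_step, input_ids, max_new_length, temperature, do_sample, eos_token_id=None):
    tokens = input_ids
    for _ in range(max_new_length):
        logits = model_step(tokens)[:, -1, :]                     # logits for the last position only
        if do_sample:
            probs = torch.softmax(logits / max(temperature, 1e-5), dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)  # stochastic sampling
        else:
            next_token = logits.argmax(dim=-1, keepdim=True)      # greedy when do_sample is off
        tokens = torch.cat([tokens, next_token], dim=-1)
        if eos_token_id is not None and (next_token == eos_token_id).all():
            break
    return tokens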
class DecodeTokens:
@classmethod
def INPUT_TYPES(s):
return {
"required": {
"clip": (IO.CLIP, {"tooltip": "The model used for generation."}),
"tokens": ("TOKENS", ),}
}
FUNCTION = "decode"
CATEGORY = "conditioning"
RETURN_TYPES = ("TEXT", "AUDIO")
def decode(self, clip, tokens):
clip.load_model()
if hasattr(clip.cond_stage_model, "decode_tokens"): # for special tokenizers
return clip.cond_stage_model.decode_tokens(tokens)
else:
return (clip.tokenizer.decode(tokens, skip_special_tokens=True), None)
class SaveImage:
    def __init__(self):
        self.output_dir = folder_paths.get_output_directory()
@@ -1936,9 +1985,11 @@ class ImagePadForOutpaint:
NODE_CLASS_MAPPINGS = {
    "KSampler": KSampler,
    "AutoRegressiveGeneration": AutoRegressiveGeneration,
    "CheckpointLoaderSimple": CheckpointLoaderSimple,
    "CLIPTextEncode": CLIPTextEncode,
    "CLIPSetLastLayer": CLIPSetLastLayer,
    "DecodeTokens": DecodeTokens,
    "VAEDecode": VAEDecode,
    "VAEEncode": VAEEncode,
    "VAEEncodeForInpaint": VAEEncodeForInpaint,
@@ -2008,6 +2059,8 @@ NODE_DISPLAY_NAME_MAPPINGS = {
    # Sampling
    "KSampler": "KSampler",
    "KSamplerAdvanced": "KSampler (Advanced)",
    "AutoRegressiveGeneration": "Autoregressive Generation",
""
    # Loaders
    "CheckpointLoader": "Load Checkpoint With Config (DEPRECATED)",
    "CheckpointLoaderSimple": "Load Checkpoint",
@@ -2025,6 +2078,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
    "StyleModelApply": "Apply Style Model",
    "CLIPTextEncode": "CLIP Text Encode (Prompt)",
    "CLIPSetLastLayer": "CLIP Set Last Layer",
    "DecodeTokens": "Decode Tokens",
    "ConditioningCombine": "Conditioning (Combine)",
    "ConditioningAverage ": "Conditioning (Average)",
    "ConditioningConcat": "Conditioning (Concat)",