#!/usr/bin/env python3
"""
Standalone script to generate TTS audio using VibeVoice.

This script has NO ComfyUI dependencies and uses the models directly
from HuggingFace. Based on Microsoft's VibeVoice:
https://github.com/microsoft/VibeVoice

Requirements:
    pip install torch transformers numpy scipy soundfile librosa huggingface-hub

Usage:
    python generate_vibevoice_standalone.py
"""

import torch
import numpy as np
import soundfile as sf
import os
import random
import re
import logging
from typing import Optional, List, Tuple
from huggingface_hub import snapshot_download

logging.basicConfig(level=logging.INFO, format='[VibeVoice] %(message)s')
logger = logging.getLogger(__name__)

try:
    import librosa
    LIBROSA_AVAILABLE = True
except ImportError:
    logger.warning("librosa not available - resampling will not work")
    LIBROSA_AVAILABLE = False


def set_seed(seed: int):
    """Set random seeds for reproducibility. A seed of 0 picks a random one."""
    if seed == 0:
        seed = random.randint(1, 0xffffffffffffffff)
    # numpy only accepts 32-bit seeds, so fold larger values into range
    MAX_NUMPY_SEED = 2**32 - 1
    numpy_seed = seed % MAX_NUMPY_SEED
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(numpy_seed)
    random.seed(seed)
    return seed


def parse_script(script: str) -> Tuple[List[Tuple[int, str]], List[int]]:
    """
    Parse a speaker script into (speaker_id, text) tuples.

    Supported formats:
        [1] Some text...
        Speaker 1: Some text...

    Returns:
        parsed_lines: List of (0-based speaker_id, text) tuples
        speaker_ids: Sorted list of unique 1-based speaker IDs
    """
    parsed_lines = []
    speaker_ids_in_script = []
    line_format_regex = re.compile(r'^(?:Speaker\s+(\d+)\s*:|\[(\d+)\])\s*(.*)$', re.IGNORECASE)

    for line in script.strip().split("\n"):
        line = line.strip()
        if not line:
            continue

        match = line_format_regex.match(line)
        if match:
            speaker_id_str = match.group(1) or match.group(2)
            speaker_id = int(speaker_id_str)
            text_content = match.group(3)

            # Handle a stray colon after the bracket form, e.g. "[1]: text"
            if match.group(1) is None and text_content.lstrip().startswith(':'):
                colon_index = text_content.find(':')
                text_content = text_content[colon_index + 1:]

            if speaker_id < 1:
                logger.warning(f"Speaker ID must be 1 or greater. Skipping line: '{line}'")
                continue

            text = text_content.strip()
            internal_speaker_id = speaker_id - 1
            parsed_lines.append((internal_speaker_id, text))
            if speaker_id not in speaker_ids_in_script:
                speaker_ids_in_script.append(speaker_id)
        else:
            logger.warning(f"Could not parse speaker marker, ignoring: '{line}'")

    if not parsed_lines and script.strip():
        logger.info("No speaker markers found. Treating entire text as Speaker 1.")
        parsed_lines.append((0, script.strip()))
        speaker_ids_in_script.append(1)

    return parsed_lines, sorted(set(speaker_ids_in_script))
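
# Illustrative example (not executed): parse_script("[1] Hello\n[2] Hi there")
# returns ([(0, "Hello"), (1, "Hi there")], [1, 2]) -- internal speaker IDs are
# 0-based, while the returned ID list stays 1-based for the user-facing API.
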
def load_audio_file(audio_path: str, target_sr: int = 24000) -> Optional[np.ndarray]:
    """Load audio file and convert to mono at target sample rate"""
    if not os.path.exists(audio_path):
        logger.error(f"Audio file not found: {audio_path}")
        return None

    logger.info(f"Loading audio: {audio_path}")
    try:
        # Load audio using soundfile
        waveform, sr = sf.read(audio_path)

        # Convert to mono if stereo
        if waveform.ndim > 1:
            waveform = np.mean(waveform, axis=1)

        # Resample if needed
        if sr != target_sr:
            if not LIBROSA_AVAILABLE:
                raise ImportError("librosa is required for resampling. Install with: pip install librosa")
            logger.info(f"Resampling from {sr}Hz to {target_sr}Hz")
            waveform = librosa.resample(y=waveform, orig_sr=sr, target_sr=target_sr)

        # Validate audio
        if np.any(np.isnan(waveform)) or np.any(np.isinf(waveform)):
            logger.error("Audio contains NaN or Inf values, replacing with zeros")
            waveform = np.nan_to_num(waveform, nan=0.0, posinf=0.0, neginf=0.0)

        if np.all(waveform == 0):
            logger.warning("Audio waveform is completely silent")

        # Normalize extreme values
        max_val = np.abs(waveform).max()
        if max_val > 10.0:
            logger.warning(f"Audio values are very large (max: {max_val}), normalizing")
            waveform = waveform / max_val

        return waveform.astype(np.float32)

    except Exception as e:
        logger.error(f"Error loading audio: {e}")
        return None


def download_model(model_name: str = "VibeVoice-1.5B", cache_dir: str = "./models"):
    """Download VibeVoice model from HuggingFace"""
    repo_mapping = {
        "VibeVoice-1.5B": "microsoft/VibeVoice-1.5B",
        "VibeVoice-Large": "aoi-ot/VibeVoice-Large"
    }

    if model_name not in repo_mapping:
        raise ValueError(f"Unknown model: {model_name}. Choose from: {list(repo_mapping.keys())}")

    repo_id = repo_mapping[model_name]
    model_path = os.path.join(cache_dir, model_name)

    if os.path.exists(os.path.join(model_path, "config.json")):
        logger.info(f"Model already downloaded: {model_path}")
        return model_path

    logger.info(f"Downloading model from {repo_id}...")
    os.makedirs(cache_dir, exist_ok=True)
    model_path = snapshot_download(
        repo_id=repo_id,
        local_dir=model_path,
        local_dir_use_symlinks=False
    )
    logger.info(f"Model downloaded to: {model_path}")
    return model_path
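
# Note: download_model caches under <cache_dir>/<model_name>/ and treats the
# presence of config.json as "already downloaded"; delete that directory to
# force a fresh snapshot_download.
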
def generate_tts(
    text: str,
    model_name: str = "VibeVoice-Large",
    speaker_audio_paths: Optional[dict] = None,
    output_path: str = "output.wav",
    cfg_scale: float = 1.3,
    inference_steps: int = 10,
    seed: int = 42,
    temperature: float = 0.95,
    top_p: float = 0.95,
    top_k: int = 0,
    cache_dir: str = "./models",
    device: str = "auto"
):
    """
    Generate TTS audio using VibeVoice.

    Args:
        text: Text script with speaker markers like "[1] text" or "Speaker 1: text"
        model_name: Model to use ("VibeVoice-1.5B" or "VibeVoice-Large")
        speaker_audio_paths: Dict mapping speaker IDs to audio file paths for
            voice cloning, e.g. {1: "voice1.wav", 2: "voice2.wav"}
        output_path: Where to save the generated audio
        cfg_scale: Classifier-Free Guidance scale (higher = closer adherence to the prompt)
        inference_steps: Number of diffusion steps
        seed: Random seed for reproducibility (0 = random)
        temperature: Sampling temperature
        top_p: Nucleus sampling parameter
        top_k: Top-K sampling parameter (0 disables it)
        cache_dir: Directory to cache downloaded models
        device: Device to use ("cuda", "mps", "cpu", or "auto" for automatic detection)
    """
    # Set seed
    actual_seed = set_seed(seed)
    logger.info(f"Using seed: {actual_seed}")

    # Determine device - with MPS support for Mac
    if device == "auto":
        if torch.cuda.is_available():
            device = "cuda"
        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
            device = "mps"
            logger.info("MPS (Metal Performance Shaders) detected - using Mac GPU acceleration")
        else:
            device = "cpu"
    logger.info(f"Using device: {device}")

    # Download model if needed
    model_path = download_model(model_name, cache_dir)

    # Import VibeVoice components
    logger.info("Loading VibeVoice model...")
    try:
        # Add the VibeVoice custom model code to the path
        import sys
        vibevoice_custom_path = os.path.join(os.path.dirname(__file__), "custom_nodes", "ComfyUI-VibeVoice")
        if vibevoice_custom_path not in sys.path:
            sys.path.insert(0, vibevoice_custom_path)

        # Import the custom VibeVoice model classes
        from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
        from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
        from vibevoice.processor.vibevoice_tokenizer_processor import VibeVoiceTokenizerProcessor
        from vibevoice.modular.modular_vibevoice_text_tokenizer import VibeVoiceTextTokenizerFast
        from vibevoice.modular.configuration_vibevoice import VibeVoiceConfig
        import json

        # Load config
        config_path = os.path.join(model_path, "config.json")
        config = VibeVoiceConfig.from_pretrained(config_path)

        # Load tokenizer - download if not present
        tokenizer_file = os.path.join(model_path, "tokenizer.json")
        if not os.path.exists(tokenizer_file):
            logger.info("tokenizer.json not found, downloading from HuggingFace...")
            from huggingface_hub import hf_hub_download
            # Determine which Qwen tokenizer to use based on model size
            qwen_repo = "Qwen/Qwen2.5-1.5B" if "1.5B" in model_name else "Qwen/Qwen2.5-7B"
            try:
                hf_hub_download(
                    repo_id=qwen_repo,
                    filename="tokenizer.json",
                    local_dir=model_path,
                    local_dir_use_symlinks=False
                )
                logger.info("tokenizer.json downloaded successfully")
            except Exception as e:
                logger.error(f"Failed to download tokenizer.json: {e}")
                raise FileNotFoundError(f"Could not download tokenizer.json from {qwen_repo}")

        tokenizer = VibeVoiceTextTokenizerFast(tokenizer_file=tokenizer_file)

        # Load processor config
        preprocessor_config_path = os.path.join(model_path, "preprocessor_config.json")
        processor_config_data = {}
        if os.path.exists(preprocessor_config_path):
            with open(preprocessor_config_path, 'r') as f:
                processor_config_data = json.load(f)

        audio_processor = VibeVoiceTokenizerProcessor()
        processor = VibeVoiceProcessor(
            tokenizer=tokenizer,
            audio_processor=audio_processor,
            speech_tok_compress_ratio=processor_config_data.get("speech_tok_compress_ratio", 3200),
            db_normalize=processor_config_data.get("db_normalize", True)
        )

        # Pick a dtype: MPS doesn't support bfloat16 well, so use float16 there
        if device == "mps":
            dtype = torch.float16
            logger.info("Using float16 for MPS device")
        elif torch.cuda.is_available() and torch.cuda.is_bf16_supported():
            dtype = torch.bfloat16
        else:
            dtype = torch.float16

        model = VibeVoiceForConditionalGenerationInference.from_pretrained(
            model_path,
            config=config,
            torch_dtype=dtype,
            device_map=device,
            attn_implementation="sdpa"
        )
        model.eval()
        logger.info("Model loaded successfully")

    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        import traceback
        traceback.print_exc()
        raise

    # Parse script
    parsed_lines, speaker_ids = parse_script(text)
    if not parsed_lines:
        raise ValueError("Script is empty or invalid")

    logger.info(f"Parsed {len(parsed_lines)} lines with speakers: {speaker_ids}")

    # Load speaker audio samples (None falls back to zero-shot TTS)
    voice_samples = []
    if speaker_audio_paths is None:
        speaker_audio_paths = {}

    for speaker_id in speaker_ids:
        audio_path = speaker_audio_paths.get(speaker_id)
        if audio_path:
            audio = load_audio_file(audio_path, target_sr=24000)
            if audio is None:
                logger.warning(f"Could not load audio for speaker {speaker_id}, using zero-shot TTS")
                voice_samples.append(None)
            else:
                voice_samples.append(audio)
        else:
            logger.info(f"No reference audio for speaker {speaker_id}, using zero-shot TTS")
            voice_samples.append(None)

    # Prepare inputs
    logger.info("Processing inputs...")
    try:
        inputs = processor(
            parsed_scripts=[parsed_lines],
            voice_samples=[voice_samples],
            speaker_ids_for_prompt=[speaker_ids],
            padding=True,
            return_tensors="pt",
            return_attention_mask=True
        )
        # Move tensors to the target device
        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
    except Exception as e:
        logger.error(f"Error processing inputs: {e}")
        raise
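
    # Note on the two main quality knobs (general diffusion-TTS guidance, not
    # VibeVoice-specific documentation): cfg_scale trades naturalness against
    # adherence to the conditioning, while inference_steps sets how many DDPM
    # denoising iterations run -- more steps cost time but usually sound cleaner.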
    # Configure generation
    model.set_ddpm_inference_steps(num_steps=inference_steps)

    generation_config = {
        'do_sample': True,
        'temperature': temperature,
        'top_p': top_p,
    }
    if top_k > 0:
        generation_config['top_k'] = top_k

    # Generate
    logger.info(f"Generating audio ({inference_steps} steps)...")
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=None,
                cfg_scale=cfg_scale,
                tokenizer=processor.tokenizer,
                generation_config=generation_config,
                verbose=False
            )

        # Extract waveform (cast via .float() first: numpy cannot read bfloat16 tensors)
        waveform = outputs.speech_outputs[0].cpu().float().numpy()

        # Ensure correct shape
        if waveform.ndim == 1:
            waveform = waveform.reshape(1, -1)
        elif waveform.ndim == 2 and waveform.shape[0] > 1:
            # If multiple channels, take the first
            waveform = waveform[0:1, :]

        # Convert to float32 for soundfile compatibility
        waveform = waveform.astype(np.float32)

        # Save audio
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        sf.write(output_path, waveform.T, 24000)
        logger.info(f"Audio saved to: {output_path}")

        return waveform

    except Exception as e:
        logger.error(f"Error during generation: {e}")
        raise


def main():
    """Example usage"""
    # Configuration
    model_name = "VibeVoice-Large"  # or "VibeVoice-1.5B"

    # Text to generate - supports multiple speakers
    text = """
[1] Hello, this is speaker one. How are you today?
[2] Hi there! This is speaker two responding to you. It's great to meet you.
[1] Likewise! Let's generate some amazing speech together.
[2] Absolutely! VibeVoice makes it so easy to create diverse voices.
"""

    # Reference audio for voice cloning (optional).
    # Speakers without an entry here fall back to zero-shot TTS.
    speaker_audio_paths = {
        1: "input/audio1.wav",    # Reference audio for speaker 1
        2: "input/laundry.mp3",   # Reference audio for speaker 2
    }

    # Generation parameters
    output_path = "output/vibevoice_generated.wav"
    cfg_scale = 1.3
    inference_steps = 10
    seed = 42  # or 0 for random
    temperature = 0.95
    top_p = 0.95
    top_k = 0

    print("=" * 60)
    print("VibeVoice TTS - Standalone Script")
    print("=" * 60)
    print(f"Model: {model_name}")
    print(f"Text: {text[:100]}...")
    print("=" * 60)

    try:
        generate_tts(
            text=text,
            model_name=model_name,
            speaker_audio_paths=speaker_audio_paths,
            output_path=output_path,
            cfg_scale=cfg_scale,
            inference_steps=inference_steps,
            seed=seed,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            cache_dir="./models",
            device="auto"
        )
        print("=" * 60)
        print("Generation complete!")
        print(f"Audio saved to: {output_path}")
        print("=" * 60)
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
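
# The generator can also be used from other code (illustrative snippet; the
# module name assumes this file is saved as generate_vibevoice_standalone.py):
#
#   from generate_vibevoice_standalone import generate_tts
#   generate_tts(text="[1] Hello world.", model_name="VibeVoice-1.5B",
#                output_path="hello.wav", seed=0)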