from pydantic import BaseModel, Field


class MusicSection(BaseModel):
    section_name: str = Field(..., description="Name of the section")
    positive_local_styles: list[str] = Field(..., description="Styles to apply within this section")
    negative_local_styles: list[str] = Field(..., description="Styles to avoid within this section")
    duration_ms: int = Field(..., description="Section duration in milliseconds")
    lines: list[str] = Field(..., description="Lyric lines for this section")


class MusicPrompt(BaseModel):
    positive_global_styles: list[str] = Field(..., description="Styles to apply to the whole composition")
    negative_global_styles: list[str] = Field(..., description="Styles to avoid across the whole composition")
    sections: list[MusicSection] = Field(..., description="Ordered sections of the composition")


class ComposeMusicRequest(BaseModel):
    model_id: str = Field(..., description="Model ID for music generation")
    prompt: str | None = Field(..., description="Free-text prompt describing the music")
    music_length_ms: int | None = Field(..., description="Total length of the generated music in milliseconds")
    force_instrumental: bool | None = Field(..., description="Generate instrumental music without vocals")
    composition_plan: MusicPrompt | None = Field(..., description="Structured composition plan")
    respect_sections_durations: bool | None = Field(..., description="Keep the exact section durations from the plan")
    output_format: str | None = Field(..., description="Audio output format")


class CreateCompositionPlanRequest(BaseModel):
    prompt: str = Field(..., description="Free-text prompt describing the music")
    music_length_ms: int | None = Field(..., description="Total length of the planned music in milliseconds")
    model_id: str = Field(..., description="Model ID for plan generation")


class SpeechToTextRequest(BaseModel):
    model_id: str = Field(..., description="Model ID for transcription")
    cloud_storage_url: str = Field(..., description="URL of the audio file in cloud storage")
    language_code: str | None = Field(None, description="ISO-639-1 or ISO-639-3 language code")
    tag_audio_events: bool | None = Field(None, description="Annotate sounds like (laughter) in the transcript")
    num_speakers: int | None = Field(None, description="Maximum number of speakers predicted")
    timestamps_granularity: str = Field(default="word", description="Timing precision: none, word, or character")
    diarize: bool | None = Field(None, description="Annotate which speaker is talking")
    diarization_threshold: float | None = Field(None, description="Speaker separation sensitivity")
    temperature: float | None = Field(None, description="Randomness control")
    seed: int = Field(..., description="Seed for deterministic sampling")


class SpeechToTextWord(BaseModel):
    text: str = Field(..., description="The word text")
    type: str = Field(default="word", description="Type of text element (word, spacing, etc.)")
    start: float | None = Field(None, description="Start time in seconds (when timestamps are enabled)")
    end: float | None = Field(None, description="End time in seconds (when timestamps are enabled)")
    speaker_id: str | None = Field(None, description="Speaker identifier when diarization is enabled")
    logprob: float | None = Field(None, description="Log probability of the word")


class SpeechToTextResponse(BaseModel):
    language_code: str = Field(..., description="Detected or specified language code")
    language_probability: float | None = Field(None, description="Confidence of the language detection")
    text: str = Field(..., description="Full transcript text")
    words: list[SpeechToTextWord] | None = Field(None, description="Word-level timing information")


class TextToSpeechVoiceSettings(BaseModel):
    stability: float | None = Field(None, description="Voice stability")
    similarity_boost: float | None = Field(None, description="Similarity boost")
    style: float | None = Field(None, description="Style exaggeration")
    use_speaker_boost: bool | None = Field(None, description="Boost similarity to the original speaker")
    speed: float | None = Field(None, description="Speech speed")


class TextToSpeechRequest(BaseModel):
    text: str = Field(..., description="Text to convert to speech")
    model_id: str = Field(..., description="Model ID for TTS")
    language_code: str | None = Field(None, description="ISO-639-1 or ISO-639-3 language code")
    voice_settings: TextToSpeechVoiceSettings | None = Field(None, description="Voice settings")
    seed: int = Field(..., description="Seed for deterministic sampling")
    apply_text_normalization: str | None = Field(None, description="Text normalization mode: auto, on, off")
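
# Illustrative sketch, not part of the original schema: shows how a SpeechToTextRequest
# might be built and serialized for an HTTP call, assuming Pydantic v2
# (model_dump_json). The model_id value and the cloud storage URL are placeholder
# assumptions, not values defined by this module.
def _example_speech_to_text_request() -> str:
    request = SpeechToTextRequest(
        model_id="scribe_v1",  # assumed model identifier; replace with a real one
        cloud_storage_url="https://example.com/audio/meeting.mp3",  # placeholder URL
        language_code="en",
        diarize=True,
        num_speakers=2,
        seed=42,
    )
    # exclude_none=True drops the optional fields that were left unset,
    # so only explicitly provided parameters appear in the request body.
    return request.model_dump_json(exclude_none=True)
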
class TextToSoundEffectsRequest(BaseModel):
    text: str = Field(..., description="Text prompt to convert into a sound effect")
    duration_seconds: float = Field(..., description="Duration of generated sound in seconds")
    prompt_influence: float = Field(..., description="How closely generation follows the prompt")
    loop: bool | None = Field(None, description="Whether to create a smoothly looping sound effect")


class AddVoiceRequest(BaseModel):
    name: str = Field(..., description="Name that identifies the voice")
    remove_background_noise: bool = Field(..., description="Remove background noise from voice samples")


class AddVoiceResponse(BaseModel):
    voice_id: str = Field(..., description="The newly created voice's unique identifier")


class SpeechToSpeechRequest(BaseModel):
    model_id: str = Field(..., description="Model ID for speech-to-speech")
    voice_settings: str = Field(..., description="JSON string of voice settings")
    seed: int = Field(..., description="Seed for deterministic sampling")
    remove_background_noise: bool = Field(..., description="Remove background noise from input audio")


class DialogueInput(BaseModel):
    text: str = Field(..., description="Text content to convert to speech")
    voice_id: str = Field(..., description="Voice identifier for this dialogue segment")


class DialogueSettings(BaseModel):
    stability: float | None = Field(None, description="Voice stability (0-1)")


class TextToDialogueRequest(BaseModel):
    inputs: list[DialogueInput] = Field(..., description="List of dialogue segments")
    model_id: str = Field(..., description="Model ID for dialogue generation")
    language_code: str | None = Field(None, description="ISO-639-1 language code")
    settings: DialogueSettings | None = Field(None, description="Voice settings")
    seed: int | None = Field(None, description="Seed for deterministic sampling")
    apply_text_normalization: str | None = Field(None, description="Text normalization mode: auto, on, off")
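
# Illustrative sketch, not part of the original schema: builds a two-speaker
# TextToDialogueRequest from nested DialogueInput and DialogueSettings models.
# The model_id and voice_id values are placeholder assumptions.
def _example_text_to_dialogue_request() -> TextToDialogueRequest:
    return TextToDialogueRequest(
        inputs=[
            DialogueInput(text="Did you finish the report?", voice_id="voice_a_placeholder"),
            DialogueInput(text="Almost, I just need one more hour.", voice_id="voice_b_placeholder"),
        ],
        model_id="eleven_v3",  # assumed model identifier; replace with a real one
        language_code="en",
        settings=DialogueSettings(stability=0.5),
        seed=123,
        # apply_text_normalization is omitted and defaults to None.
    )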