"""Gradio Space: analyze audio with the Ultravox v0.6 (8B) speech-LLM pipeline."""

import os

# ------------------------------------------------------------------
# Quiet OpenMP/BLAS noise on Spaces.
# NOTE: these MUST be set before numpy / librosa / transformers are
# imported — the OpenMP/BLAS runtimes read them at import time, so
# setting them after the imports (as the code previously did) has no
# effect on thread counts.
# ------------------------------------------------------------------
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import gradio as gr
import librosa
import numpy as np
import spaces
import transformers

# ---------------------------
# Model config
# ---------------------------
MODEL_ID = "fixie-ai/ultravox-v0_6-llama-3_1-8b"
REVISION = "main"
TARGET_SR = 16000  # Ultravox examples use 16k audio
MAX_AUDIO_SEC = 300  # safety cap on clip length (5 minutes)
# Peak floor below which we treat a clip as silence and skip normalization;
# boosting near-silence would amplify noise to full scale and defeat the
# RMS quality check in analyze_audio().
MIN_NORMALIZE_PEAK = 1e-3

# ---------------------------
# Global pipeline (lazy-loaded)
# ---------------------------
pipe = None


def load_model():
    """Load the Ultravox v0.6 pipeline (8B) once; later calls are no-ops.

    Returns:
        A human-readable status string. Callers key off the leading
        "✅"/"❌" prefix, so the prefixes must be preserved.
    """
    global pipe
    if pipe is not None:
        return "✅ Model already loaded!"
    try:
        print(f"Loading Ultravox model: {MODEL_ID} (revision={REVISION})")
        pipe = transformers.pipeline(
            model=MODEL_ID,
            trust_remote_code=True,  # required for Ultravox custom pipeline
            device_map="auto",
            dtype="auto",
            revision=REVISION,
        )
        print("✅ Pipeline loaded successfully!")
        return "✅ Model pipeline loaded successfully!"
    except Exception as e:
        err = f"❌ Error loading model: {e}"
        print(err)
        return err


# ---------------------------
# Audio utilities
# ---------------------------
def load_audio_from_gradio(audio_input):
    """Normalize any gr.Audio payload into float32 mono audio at 16 kHz.

    Supports both gr.Audio types:
      - type="numpy"    -> (sample_rate, np.ndarray), typically int16 PCM
      - type="filepath" -> "/tmp/....wav"

    Args:
        audio_input: tuple of (sr, ndarray) or a filepath string.

    Returns:
        (audio, sr): float32 mono ndarray at TARGET_SR, capped at
        MAX_AUDIO_SEC seconds.

    Raises:
        ValueError: if audio_input is neither a tuple nor a string.
    """
    if isinstance(audio_input, tuple):
        sr, audio = audio_input
        audio = np.asarray(audio)
        # Gradio numpy audio is integer PCM (usually int16); scale into
        # [-1, 1] so trimming / DC-offset removal operate on sane values.
        if np.issubdtype(audio.dtype, np.integer):
            audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    elif isinstance(audio_input, str):
        # Read from tmp filepath; sr=None preserves the file's native rate.
        audio, sr = librosa.load(audio_input, sr=None)
    else:
        raise ValueError(f"Unsupported audio input type: {type(audio_input)}")

    # Ensure float32 ndarray
    audio = np.asarray(audio, dtype=np.float32)

    # Stereo -> mono (gradio numpy audio is shaped (samples, channels))
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)

    # Trim leading/trailing silence (conservative threshold)
    audio, _ = librosa.effects.trim(audio, top_db=30)

    # Remove DC offset
    if audio.size:
        audio = audio - float(np.mean(audio))

    # Normalize peak to ~0.98 to improve quiet recordings — but only for
    # genuinely audible signals: normalizing near-silence would blow up
    # noise and make the downstream "too quiet" RMS check unreachable.
    peak = float(np.max(np.abs(audio))) if audio.size else 0.0
    if peak > MIN_NORMALIZE_PEAK:
        audio = (0.98 / peak) * audio

    # Resample to 16k
    if sr != TARGET_SR:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SR)
        sr = TARGET_SR

    # Safety cap at 5 minutes
    if len(audio) / float(sr) > MAX_AUDIO_SEC:
        audio = audio[: int(MAX_AUDIO_SEC * sr)]

    return audio, sr


# ---------------------------
# Inference handler
# ---------------------------
@spaces.GPU
def analyze_audio(audio_file, system_prompt):
    """Run Ultravox on one clip with user-supplied analysis instructions.

    The system prompt carries the instructions; the audio itself is
    injected via the <|audio|> placeholder token in the user turn.

    Args:
        audio_file: gr.Audio value (filepath string or (sr, ndarray) tuple).
        system_prompt: free-text instructions for the system turn.

    Returns:
        Markdown string with the model's answer, or a "❌ ..." message.
    """
    global pipe
    if pipe is None:
        status = load_model()
        if status.startswith("❌"):
            return status

    if audio_file is None:
        return "❌ Please upload or record an audio file."

    # Load & preprocess audio
    try:
        audio, sr = load_audio_from_gradio(audio_file)
    except Exception as e:
        return f"❌ Failed to read/process audio: {e}"

    # Quick quality checks
    dur = len(audio) / float(sr) if sr else 0
    rms = float(np.sqrt(np.mean(audio**2))) if audio.size else 0.0
    if dur < 1.0:
        return "❌ Audio too short (<1s). Please upload a longer sample."
    if rms < 1e-3:
        return "❌ Audio extremely quiet. Increase mic gain or speak closer to the microphone."

    sys_text = (
        system_prompt
        or "You are a helpful assistant that analyzes the provided audio and explains what it contains."
    ).strip()

    # Build turns: system message with user instructions + user message with audio token
    turns = [
        {"role": "system", "content": sys_text},
        {"role": "user", "content": "<|audio|>"},
    ]

    try:
        out = pipe(
            {"audio": audio, "turns": turns, "sampling_rate": sr},
            max_new_tokens=1000,  # increased for longer, more detailed responses
        )
        # The custom pipeline may return a list of dicts, a list of strings,
        # or a bare string depending on version — handle all shapes instead
        # of assuming out[0] is a dict (which previously raised AttributeError).
        if isinstance(out, list) and out and isinstance(out[0], dict):
            text = out[0].get("generated_text", str(out))
        else:
            text = str(out)
        return f"✅ Processed.\n\n{text}"
    except Exception as e:
        return f"❌ Inference error: {e}"


# ---------------------------
# UI
# ---------------------------
startup_status = "⏳ Model loads on first request (8B is fairly quick)."

with gr.Blocks(title="Ultravox v0.6 (8B) — Audio Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🎤 Ultravox v0.6 (8B) — Audio Analyzer
Upload an audio file (or record) and provide **analysis instructions**.
The instructions tell the AI what to analyze in the audio using the `<|audio|>` token.
""")
    with gr.Row():
        with gr.Column():
            # For uploads, `filepath` is robust; mic also works.
            audio_input = gr.Audio(
                label="🎵 Upload or Record Audio",
                sources=["upload", "microphone"],
                type="filepath",  # handler also supports numpy tuples
            )
            system_prompt = gr.Textbox(
                label="🧠 Analysis Instructions (what should the AI analyze in the audio?)",
                value="You are a helpful assistant that analyzes the audio and describes what it contains.",
                lines=8,
                max_lines=20,
            )
            submit_btn = gr.Button("🚀 Analyze", variant="primary")
        with gr.Column():
            output = gr.Markdown(
                label="🤖 Model Response",
                value=f"**Model Status:** {startup_status}",
            )

    submit_btn.click(
        fn=analyze_audio,
        inputs=[audio_input, system_prompt],
        outputs=output,
    )

if __name__ == "__main__":
    # Disabling SSR helps avoid upload quirks on Spaces
    demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)