"""Gradio Space: analyze audio with the Ultravox v0.6 (8B) speech-LLM pipeline."""

import os

# ------------------------------------------------------------------
# Quiet OpenMP/BLAS noise on Spaces.
# NOTE: these MUST be set before numpy / librosa / transformers are
# imported — the OpenMP/BLAS runtimes read them at import time, so
# setting them after the imports (as the code previously did) has no
# effect on thread counts.
# ------------------------------------------------------------------
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import gradio as gr
import librosa
import numpy as np
import spaces
import transformers

# ---------------------------
# Model config
# ---------------------------
MODEL_ID = "fixie-ai/ultravox-v0_6-llama-3_1-8b"
REVISION = "main"
TARGET_SR = 16000  # Ultravox examples use 16k audio
MAX_AUDIO_SEC = 300  # safety cap on clip length (5 minutes)
# Peak floor below which we treat a clip as silence and skip normalization;
# boosting near-silence would amplify noise to full scale and defeat the
# RMS quality check in analyze_audio().
MIN_NORMALIZE_PEAK = 1e-3

# ---------------------------
# Global pipeline (lazy-loaded)
# ---------------------------
pipe = None


def load_model():
    """Load the Ultravox v0.6 pipeline (8B) once; later calls are no-ops.

    Returns:
        A human-readable status string. Callers key off the leading
        "✅"/"❌" prefix, so the prefixes must be preserved.
    """
    global pipe
    if pipe is not None:
        return "✅ Model already loaded!"
    try:
        print(f"Loading Ultravox model: {MODEL_ID} (revision={REVISION})")
        pipe = transformers.pipeline(
            model=MODEL_ID,
            trust_remote_code=True,  # required for Ultravox custom pipeline
            device_map="auto",
            dtype="auto",
            revision=REVISION,
        )
        print("✅ Pipeline loaded successfully!")
        return "✅ Model pipeline loaded successfully!"
    except Exception as e:
        err = f"❌ Error loading model: {e}"
        print(err)
        return err


# ---------------------------
# Audio utilities
# ---------------------------
def load_audio_from_gradio(audio_input):
    """Normalize any gr.Audio payload into float32 mono audio at 16 kHz.

    Supports both gr.Audio types:
      - type="numpy"    -> (sample_rate, np.ndarray), typically int16 PCM
      - type="filepath" -> "/tmp/....wav"

    Args:
        audio_input: tuple of (sr, ndarray) or a filepath string.

    Returns:
        (audio, sr): float32 mono ndarray at TARGET_SR, capped at
        MAX_AUDIO_SEC seconds.

    Raises:
        ValueError: if audio_input is neither a tuple nor a string.
    """
    if isinstance(audio_input, tuple):
        sr, audio = audio_input
        audio = np.asarray(audio)
        # Gradio numpy audio is integer PCM (usually int16); scale into
        # [-1, 1] so trimming / DC-offset removal operate on sane values.
        if np.issubdtype(audio.dtype, np.integer):
            audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    elif isinstance(audio_input, str):
        # Read from tmp filepath; sr=None preserves the file's native rate.
        audio, sr = librosa.load(audio_input, sr=None)
    else:
        raise ValueError(f"Unsupported audio input type: {type(audio_input)}")

    # Ensure float32 ndarray
    audio = np.asarray(audio, dtype=np.float32)

    # Stereo -> mono (gradio numpy audio is shaped (samples, channels))
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)

    # Trim leading/trailing silence (conservative threshold)
    audio, _ = librosa.effects.trim(audio, top_db=30)

    # Remove DC offset
    if audio.size:
        audio = audio - float(np.mean(audio))

    # Normalize peak to ~0.98 to improve quiet recordings — but only for
    # genuinely audible signals: normalizing near-silence would blow up
    # noise and make the downstream "too quiet" RMS check unreachable.
    peak = float(np.max(np.abs(audio))) if audio.size else 0.0
    if peak > MIN_NORMALIZE_PEAK:
        audio = (0.98 / peak) * audio

    # Resample to 16k
    if sr != TARGET_SR:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SR)
        sr = TARGET_SR

    # Safety cap at 5 minutes
    if len(audio) / float(sr) > MAX_AUDIO_SEC:
        audio = audio[: int(MAX_AUDIO_SEC * sr)]

    return audio, sr


# ---------------------------
# Inference handler
# ---------------------------
@spaces.GPU
def analyze_audio(audio_file, system_prompt):
    """Run Ultravox on one clip with user-supplied analysis instructions.

    The system prompt carries the instructions; the audio itself is
    injected via the <|audio|> placeholder token in the user turn.

    Args:
        audio_file: gr.Audio value (filepath string or (sr, ndarray) tuple).
        system_prompt: free-text instructions for the system turn.

    Returns:
        Markdown string with the model's answer, or a "❌ ..." message.
    """
    global pipe
    if pipe is None:
        status = load_model()
        if status.startswith("❌"):
            return status

    if audio_file is None:
        return "❌ Please upload or record an audio file."

    # Load & preprocess audio
    try:
        audio, sr = load_audio_from_gradio(audio_file)
    except Exception as e:
        return f"❌ Failed to read/process audio: {e}"

    # Quick quality checks
    dur = len(audio) / float(sr) if sr else 0
    rms = float(np.sqrt(np.mean(audio**2))) if audio.size else 0.0
    if dur < 1.0:
        return "❌ Audio too short (<1s). Please upload a longer sample."
    if rms < 1e-3:
        return "❌ Audio extremely quiet. Increase mic gain or speak closer to the microphone."

    sys_text = (
        system_prompt
        or "You are a helpful assistant that analyzes the provided audio and explains what it contains."
    ).strip()

    # Build turns: system message with user instructions + user message with audio token
    turns = [
        {"role": "system", "content": sys_text},
        {"role": "user", "content": "<|audio|>"},
    ]

    try:
        out = pipe(
            {"audio": audio, "turns": turns, "sampling_rate": sr},
            max_new_tokens=1000,  # increased for longer, more detailed responses
        )
        # The custom pipeline may return a list of dicts, a list of strings,
        # or a bare string depending on version — handle all shapes instead
        # of assuming out[0] is a dict (which previously raised AttributeError).
        if isinstance(out, list) and out and isinstance(out[0], dict):
            text = out[0].get("generated_text", str(out))
        else:
            text = str(out)
        return f"✅ Processed.\n\n{text}"
    except Exception as e:
        return f"❌ Inference error: {e}"


# ---------------------------
# UI
# ---------------------------
startup_status = "⏳ Model loads on first request (8B is fairly quick)."

with gr.Blocks(title="Ultravox v0.6 (8B) — Audio Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🎤 Ultravox v0.6 (8B) — Audio Analyzer
Upload an audio file (or record) and provide **analysis instructions**.
The instructions tell the AI what to analyze in the audio using the `<|audio|>` token.
""")
    with gr.Row():
        with gr.Column():
            # For uploads, `filepath` is robust; mic also works.
            audio_input = gr.Audio(
                label="🎵 Upload or Record Audio",
                sources=["upload", "microphone"],
                type="filepath",  # handler also supports numpy tuples
            )
            system_prompt = gr.Textbox(
                label="🧠 Analysis Instructions (what should the AI analyze in the audio?)",
                value="You are a helpful assistant that analyzes the audio and describes what it contains.",
                lines=8,
                max_lines=20,
            )
            submit_btn = gr.Button("🚀 Analyze", variant="primary")
        with gr.Column():
            output = gr.Markdown(
                label="🤖 Model Response",
                value=f"**Model Status:** {startup_status}",
            )

    submit_btn.click(
        fn=analyze_audio,
        inputs=[audio_input, system_prompt],
        outputs=output,
    )

if __name__ == "__main__":
    # Disabling SSR helps avoid upload quirks on Spaces
    demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)