# demo-voice / app.py — Hugging Face Space entry point
# Author: gunnerforlife52 (commit 7d514e3, verified)
"""Gradio Space: Ultravox v0.6 (8B) audio analyzer."""
import os

# ---------------------------
# Quiet OpenMP noise on Spaces
# ---------------------------
# NOTE: these thread-count variables are read by the BLAS/OpenMP runtimes
# when they are first loaded, so they must be exported BEFORE importing
# numpy / librosa / transformers. (Previously they were set after those
# imports, where they had no effect on the already-initialized libraries.)
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import gradio as gr
import transformers
import numpy as np
import librosa
import spaces

# ---------------------------
# Model config
# ---------------------------
MODEL_ID = "fixie-ai/ultravox-v0_6-llama-3_1-8b"
REVISION = "main"  # pin to a commit hash here for reproducible deploys
TARGET_SR = 16000  # Ultravox examples use 16k audio

# ---------------------------
# Global pipeline (lazy-loaded by load_model on first request)
# ---------------------------
pipe = None
def load_model():
    """Lazily construct the global Ultravox v0.6 (8B) pipeline.

    Idempotent: returns immediately if the pipeline already exists.
    Returns a human-readable status string for the UI; never raises
    (loading failures are reported in the returned string).
    """
    global pipe
    if pipe is not None:
        return "βœ… Model already loaded!"
    try:
        print(f"Loading Ultravox model: {MODEL_ID} (revision={REVISION})")
        pipe = transformers.pipeline(
            model=MODEL_ID,
            revision=REVISION,
            trust_remote_code=True,  # required for Ultravox custom pipeline
            device_map="auto",
            dtype="auto",
        )
        print("βœ… Pipeline loaded successfully!")
    except Exception as e:
        err = f"❌ Error loading model: {e}"
        print(err)
        return err
    return "βœ… Model pipeline loaded successfully!"
# ---------------------------
# Audio utilities
# ---------------------------
def load_audio_from_gradio(audio_input):
    """Normalize any gr.Audio payload to (mono float32 @ 16 kHz, 16000).

    Accepts either gr.Audio form:
      - type="numpy"    -> (sample_rate, np.ndarray)
      - type="filepath" -> path string such as "/tmp/....wav"

    Raises:
        ValueError: for any other payload type.
    """
    if isinstance(audio_input, tuple):
        sr, samples = audio_input
    elif isinstance(audio_input, str):
        # Decode from the temp file, keeping its native sample rate.
        samples, sr = librosa.load(audio_input, sr=None)
    else:
        raise ValueError(f"Unsupported audio input type: {type(audio_input)}")

    samples = np.asarray(samples, dtype=np.float32)

    # Downmix stereo (samples, channels) to mono.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    # Conservative leading/trailing silence trim.
    samples, _ = librosa.effects.trim(samples, top_db=30)

    # Remove any DC offset (new array; do not mutate the caller's buffer).
    if samples.size:
        samples = samples - float(np.mean(samples))

    # Peak-normalize to ~0.98 so quiet recordings still register.
    peak = float(np.max(np.abs(samples))) if samples.size else 0.0
    if peak > 0:
        samples = samples * (0.98 / peak)

    # Resample to the model's expected 16 kHz.
    if sr != TARGET_SR:
        samples = librosa.resample(samples, orig_sr=sr, target_sr=TARGET_SR)
        sr = TARGET_SR

    # Safety cap at 5 minutes of audio.
    limit = int(300 * sr)
    if len(samples) > limit:
        samples = samples[:limit]
    return samples, sr
# ---------------------------
# Inference handler
# ---------------------------
@spaces.GPU
def analyze_audio(audio_file, system_prompt):
    """Run Ultravox once over an uploaded/recorded clip.

    The system prompt carries the analysis instructions; the audio itself
    is injected through the <|audio|> placeholder in the user turn.
    Returns a markdown string — either the model output or a
    ❌-prefixed error message. Lazily loads the model on first call.
    """
    global pipe
    if pipe is None:
        status = load_model()
        if status.startswith("❌"):
            return status

    if audio_file is None:
        return "❌ Please upload or record an audio file."

    try:
        audio, sr = load_audio_from_gradio(audio_file)
    except Exception as e:
        return f"❌ Failed to read/process audio: {e}"

    # Reject clips that are too short or effectively silent.
    duration = len(audio) / float(sr) if sr else 0
    loudness = float(np.sqrt(np.mean(audio**2))) if audio.size else 0.0
    if duration < 1.0:
        return "❌ Audio too short (<1s). Please upload a longer sample."
    if loudness < 1e-3:
        return "❌ Audio extremely quiet. Increase mic gain or speak closer to the microphone."

    fallback = "You are a helpful assistant that analyzes the provided audio and explains what it contains."
    sys_text = (system_prompt or fallback).strip()

    # System turn = instructions; user turn = the audio placeholder token.
    turns = [
        {"role": "system", "content": sys_text},
        {"role": "user", "content": "<|audio|>"},
    ]

    try:
        out = pipe(
            {"audio": audio, "turns": turns, "sampling_rate": sr},
            max_new_tokens=1000,  # room for long, detailed responses
        )
        # Output shape varies by pipeline version; fall back to str().
        if isinstance(out, list) and out:
            text = out[0].get("generated_text", str(out))
        else:
            text = str(out)
        return f"βœ… Processed.\n\n{text}"
    except Exception as e:
        return f"❌ Inference error: {e}"
# ---------------------------
# UI
# ---------------------------
# ---------------------------
# UI
# ---------------------------
startup_status = "⏳ Model loads on first request (8B is fairly quick)."

with gr.Blocks(title="Ultravox v0.6 (8B) β€” Audio Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🎀 Ultravox v0.6 (8B) β€” Audio Analyzer
Upload an audio file (or record) and provide **analysis instructions**.
The instructions tell the AI what to analyze in the audio using the `<|audio|>` token.
""")
    with gr.Row():
        with gr.Column():
            # `filepath` is robust for uploads; the handler also accepts
            # the (sr, ndarray) tuples produced by type="numpy".
            audio_in = gr.Audio(
                label="🎡 Upload or Record Audio",
                sources=["upload", "microphone"],
                type="filepath",
            )
            prompt_box = gr.Textbox(
                label="🧠 Analysis Instructions (what should the AI analyze in the audio?)",
                value="You are a helpful assistant that analyzes the audio and describes what it contains.",
                lines=8,
                max_lines=20,
            )
            analyze_btn = gr.Button("πŸš€ Analyze", variant="primary")
        with gr.Column():
            result_md = gr.Markdown(
                label="πŸ€– Model Response",
                value=f"**Model Status:** {startup_status}",
            )

    analyze_btn.click(
        fn=analyze_audio,
        inputs=[audio_in, prompt_box],
        outputs=result_md,
    )

if __name__ == "__main__":
    # ssr_mode=False sidesteps upload quirks on Spaces.
    demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)