# demo-voice / app.py — Hugging Face Space entry point
# Author: gunnerforlife52 (commit 7d514e3, verified)
"""Gradio Space: Ultravox v0.6 (8B) audio analyzer."""
import os

# ---------------------------
# Quiet OpenMP noise on Spaces
# ---------------------------
# NOTE: these thread-count variables are read by the BLAS/OpenMP runtimes
# when they are first loaded, so they must be exported BEFORE importing
# numpy / librosa / transformers. (Previously they were set after those
# imports, where they had no effect on the already-initialized libraries.)
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import gradio as gr
import transformers
import numpy as np
import librosa
import spaces

# ---------------------------
# Model config
# ---------------------------
MODEL_ID = "fixie-ai/ultravox-v0_6-llama-3_1-8b"
REVISION = "main"  # pin to a commit hash here for reproducible deploys
TARGET_SR = 16000  # Ultravox examples use 16k audio

# ---------------------------
# Global pipeline (lazy-loaded by load_model on first request)
# ---------------------------
pipe = None
def load_model():
    """Lazily construct the global Ultravox v0.6 (8B) pipeline.

    Idempotent: returns immediately if the pipeline already exists.
    Returns a human-readable status string for the UI; never raises
    (loading failures are reported in the returned string).
    """
    global pipe
    if pipe is not None:
        return "βœ… Model already loaded!"
    try:
        print(f"Loading Ultravox model: {MODEL_ID} (revision={REVISION})")
        pipe = transformers.pipeline(
            model=MODEL_ID,
            revision=REVISION,
            trust_remote_code=True,  # required for Ultravox custom pipeline
            device_map="auto",
            dtype="auto",
        )
        print("βœ… Pipeline loaded successfully!")
    except Exception as e:
        err = f"❌ Error loading model: {e}"
        print(err)
        return err
    return "βœ… Model pipeline loaded successfully!"
# ---------------------------
# Audio utilities
# ---------------------------
def load_audio_from_gradio(audio_input):
    """Normalize any gr.Audio payload to (mono float32 @ 16 kHz, 16000).

    Accepts either gr.Audio form:
      - type="numpy"    -> (sample_rate, np.ndarray)
      - type="filepath" -> path string such as "/tmp/....wav"

    Raises:
        ValueError: for any other payload type.
    """
    if isinstance(audio_input, tuple):
        sr, samples = audio_input
    elif isinstance(audio_input, str):
        # Decode from the temp file, keeping its native sample rate.
        samples, sr = librosa.load(audio_input, sr=None)
    else:
        raise ValueError(f"Unsupported audio input type: {type(audio_input)}")

    samples = np.asarray(samples, dtype=np.float32)

    # Downmix stereo (samples, channels) to mono.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    # Conservative leading/trailing silence trim.
    samples, _ = librosa.effects.trim(samples, top_db=30)

    # Remove any DC offset (new array; do not mutate the caller's buffer).
    if samples.size:
        samples = samples - float(np.mean(samples))

    # Peak-normalize to ~0.98 so quiet recordings still register.
    peak = float(np.max(np.abs(samples))) if samples.size else 0.0
    if peak > 0:
        samples = samples * (0.98 / peak)

    # Resample to the model's expected 16 kHz.
    if sr != TARGET_SR:
        samples = librosa.resample(samples, orig_sr=sr, target_sr=TARGET_SR)
        sr = TARGET_SR

    # Safety cap at 5 minutes of audio.
    limit = int(300 * sr)
    if len(samples) > limit:
        samples = samples[:limit]
    return samples, sr
# ---------------------------
# Inference handler
# ---------------------------
@spaces.GPU
def analyze_audio(audio_file, system_prompt):
    """Run Ultravox once over an uploaded/recorded clip.

    The system prompt carries the analysis instructions; the audio itself
    is injected through the <|audio|> placeholder in the user turn.
    Returns a markdown string — either the model output or a
    ❌-prefixed error message. Lazily loads the model on first call.
    """
    global pipe
    if pipe is None:
        status = load_model()
        if status.startswith("❌"):
            return status

    if audio_file is None:
        return "❌ Please upload or record an audio file."

    try:
        audio, sr = load_audio_from_gradio(audio_file)
    except Exception as e:
        return f"❌ Failed to read/process audio: {e}"

    # Reject clips that are too short or effectively silent.
    duration = len(audio) / float(sr) if sr else 0
    loudness = float(np.sqrt(np.mean(audio**2))) if audio.size else 0.0
    if duration < 1.0:
        return "❌ Audio too short (<1s). Please upload a longer sample."
    if loudness < 1e-3:
        return "❌ Audio extremely quiet. Increase mic gain or speak closer to the microphone."

    fallback = "You are a helpful assistant that analyzes the provided audio and explains what it contains."
    sys_text = (system_prompt or fallback).strip()

    # System turn = instructions; user turn = the audio placeholder token.
    turns = [
        {"role": "system", "content": sys_text},
        {"role": "user", "content": "<|audio|>"},
    ]

    try:
        out = pipe(
            {"audio": audio, "turns": turns, "sampling_rate": sr},
            max_new_tokens=1000,  # room for long, detailed responses
        )
        # Output shape varies by pipeline version; fall back to str().
        if isinstance(out, list) and out:
            text = out[0].get("generated_text", str(out))
        else:
            text = str(out)
        return f"βœ… Processed.\n\n{text}"
    except Exception as e:
        return f"❌ Inference error: {e}"
# ---------------------------
# UI
# ---------------------------
# ---------------------------
# UI
# ---------------------------
startup_status = "⏳ Model loads on first request (8B is fairly quick)."

with gr.Blocks(title="Ultravox v0.6 (8B) β€” Audio Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🎀 Ultravox v0.6 (8B) β€” Audio Analyzer
Upload an audio file (or record) and provide **analysis instructions**.
The instructions tell the AI what to analyze in the audio using the `<|audio|>` token.
""")
    with gr.Row():
        with gr.Column():
            # `filepath` is robust for uploads; the handler also accepts
            # the (sr, ndarray) tuples produced by type="numpy".
            audio_in = gr.Audio(
                label="🎡 Upload or Record Audio",
                sources=["upload", "microphone"],
                type="filepath",
            )
            prompt_box = gr.Textbox(
                label="🧠 Analysis Instructions (what should the AI analyze in the audio?)",
                value="You are a helpful assistant that analyzes the audio and describes what it contains.",
                lines=8,
                max_lines=20,
            )
            analyze_btn = gr.Button("πŸš€ Analyze", variant="primary")
        with gr.Column():
            result_md = gr.Markdown(
                label="πŸ€– Model Response",
                value=f"**Model Status:** {startup_status}",
            )

    analyze_btn.click(
        fn=analyze_audio,
        inputs=[audio_in, prompt_box],
        outputs=result_md,
    )

if __name__ == "__main__":
    # ssr_mode=False sidesteps upload quirks on Spaces.
    demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)