Spaces:

rafaaa2105
/

Crisper-Whisper

Runtime error

App Files Files Community

Crisper-Whisper / app.py

rafaaa2105

Create app.py

ac96ee7 verified 2 months ago

raw

history blame

8.32 kB

	import gradio as gr
	import torch
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
	import spaces
	import numpy as np

	# Model configuration - Whisper checkpoint used for verbatim transcription.
	MODEL_NAME = "openai/whisper-large-v3"

	# Use the first CUDA device in half precision when one is visible,
	# otherwise fall back to full precision on the CPU.
	device = "cuda:0" if torch.cuda.is_available() else "cpu"
	torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

	# Load model and processor
	model = AutoModelForSpeechSeq2Seq.from_pretrained(
	    MODEL_NAME,
	    torch_dtype=torch_dtype,
	    low_cpu_mem_usage=True,
	    use_safetensors=True,
	)
	model.to(device)

	processor = AutoProcessor.from_pretrained(MODEL_NAME)

	# The decoder window (`max_target_positions`) is shared by the special start
	# tokens, an optional prompt and the newly generated tokens. Requesting the
	# whole window as new tokens leaves no room for the rest and is rejected by
	# Whisper's `generate`, so only half of the window is budgeted for output.
	MAX_NEW_TOKENS = model.config.max_target_positions // 2

	# Create the speech-recognition pipeline. Audio is processed in 30-second
	# chunks, 16 chunks per batch.
	pipe = pipeline(
	    "automatic-speech-recognition",
	    model=model,
	    tokenizer=processor.tokenizer,
	    feature_extractor=processor.feature_extractor,
	    max_new_tokens=MAX_NEW_TOKENS,
	    chunk_length_s=30,
	    batch_size=16,
	    torch_dtype=torch_dtype,
	    device=device,
	)

	# Context text handed to Whisper's decoder to bias it towards disfluent,
	# verbatim output. Whisper treats this as preceding context, not as an
	# instruction.
	VERBATIM_PROMPT = (
	    "Transcribe everything verbatim, including um, uh, ah, filler words, "
	    "hesitations, repetitions, false starts, and non-standard words."
	)


	def _prepare_audio_input(audio):
	    """Convert a Gradio audio value into an input the ASR pipeline accepts."""
	    if not isinstance(audio, tuple):
	        # File paths (and any other value) are handed to the pipeline as-is.
	        return audio

	    # Gradio numpy audio format: (sample_rate, audio_data)
	    sampling_rate, samples = audio
	    samples = np.asarray(samples)
	    if np.issubdtype(samples.dtype, np.signedinteger):
	        # Integer PCM has to be scaled into [-1.0, 1.0]; a bare cast would
	        # keep values in the thousands, far outside what Whisper expects.
	        full_scale = float(np.iinfo(samples.dtype).max)
	        samples = samples.astype(np.float32) / full_scale
	    else:
	        samples = samples.astype(np.float32)
	    if samples.ndim > 1:
	        # The pipeline only accepts single-channel audio. Assumes a
	        # (samples, channels) layout as produced by Gradio - TODO confirm.
	        samples = samples.mean(axis=1)
	    return {"array": samples, "sampling_rate": sampling_rate}


	def _format_transcription(result, include_timestamps):
	    """Render a pipeline result as the text shown in the output box."""
	    parts = [f"Transcription:\n{result['text']}\n"]
	    if include_timestamps and "chunks" in result:
	        parts.append("\nWord-level Timestamps:\n")
	        for chunk in result["chunks"]:
	            start, end = chunk["timestamp"][0], chunk["timestamp"][1]
	            # Entries with an open-ended timestamp are left out.
	            if start is not None and end is not None:
	                parts.append(f"[{start:.2f}s - {end:.2f}s] {chunk['text']}\n")
	    return "".join(parts)


	@spaces.GPU
	def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None):
	    """
	    Transcribe audio with the module-level Whisper pipeline on ZeroGPU.

	    Generation is configured with a verbatim-oriented prompt and temperature
	    fallback so that hesitations, fillers and repetitions are more likely to
	    be kept in the transcript.

	    Args:
	        audio: File path, or a ``(sample_rate, numpy_array)`` tuple. ``None``
	            yields a short "please provide audio" message.
	        task: Either "transcribe" or "translate" (to English).
	        return_timestamps: Whether to append word-level timestamps.
	        language: Language code (None for auto-detect).

	    Returns:
	        A string with the transcription and, when requested and available,
	        one ``[start - end] word`` line per timestamped chunk. Any exception
	        raised while transcribing is reported as an
	        ``"Error during transcription: ..."`` string instead of propagating.
	    """
	    if audio is None:
	        return "Please provide an audio file or recording."

	    try:
	        # Only argument names understood by the Transformers Whisper
	        # `generate` method are used here. The openai-whisper spellings
	        # (`condition_on_previous_text`, `prompt`) are rejected as unused
	        # model kwargs.
	        generate_kwargs = {
	            "task": task,
	            "language": language,
	            "condition_on_prev_tokens": True,
	            # Temperatures tried in order when a segment fails the
	            # compression-ratio or log-probability check below.
	            "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
	            # Segments whose zlib compression ratio is above this value are
	            # retried at the next temperature.
	            "compression_ratio_threshold": 1.35,
	            # Segments whose average log-probability is below this value
	            # are retried at the next temperature.
	            "logprob_threshold": -1.0,
	            # NOTE(review): per the Transformers docs a segment is skipped
	            # as silence based on this value together with
	            # logprob_threshold; a lower value may skip more readily -
	            # confirm 0.3 matches the "capture quiet speech" intent.
	            "no_speech_threshold": 0.3,
	            # The prompt must be passed as token ids, not as raw text.
	            "prompt_ids": processor.get_prompt_ids(
	                VERBATIM_PROMPT, return_tensors="pt"
	            ).to(device),
	        }

	        pipe_kwargs = {"generate_kwargs": generate_kwargs}
	        if return_timestamps:
	            # Word timestamps are a pipeline option. Passed through
	            # generate_kwargs the pipeline never builds the "chunks" list.
	            pipe_kwargs["return_timestamps"] = "word"

	        # Transcribe with verbatim settings
	        result = pipe(_prepare_audio_input(audio), **pipe_kwargs)
	        return _format_transcription(result, return_timestamps)

	    except Exception as e:
	        # UI boundary: show the failure in the output box instead of crashing.
	        return f"Error during transcription: {str(e)}"

	# Display name -> Whisper language code for the language dropdown.
	# Insertion order is the order shown in the UI; None means auto-detect.
	LANGUAGES = {"Auto-detect": None}
	LANGUAGES.update(
	    English="en",
	    Spanish="es",
	    French="fr",
	    German="de",
	    Italian="it",
	    Portuguese="pt",
	    Dutch="nl",
	    Russian="ru",
	    Chinese="zh",
	    Japanese="ja",
	    Korean="ko",
	    Arabic="ar",
	    Hindi="hi",
	    Turkish="tr",
	    Polish="pl",
	    Ukrainian="uk",
	    Vietnamese="vi",
	    Thai="th",
	    Indonesian="id",
	    Czech="cs",
	    Romanian="ro",
	    Swedish="sv",
	    Danish="da",
	    Norwegian="no",
	    Finnish="fi",
	    Greek="el",
	    Hebrew="he",
	)

	# Markdown shown above the controls.
	_INTRO_MARKDOWN = """
	# 🎙️ Very Verbatim Multilingual Speech-to-Text

	Powered by OpenAI Whisper Large V3 with ZeroGPU acceleration.

	Verbatim Transcription Features:
	- ✅ Captures hesitations (um, uh, ah, eh)
	- ✅ Transcribes filler words and false starts
	- ✅ Includes repetitions and stutters
	- ✅ Attempts to transcribe non-standard words and sounds
	- ✅ Preserves natural speech patterns
	- ✅ Word-level timestamps for precise alignment
	- ✅ Supports 99+ languages

	Note: This is optimized for verbatim transcription, capturing speech as naturally
	as possible including all disfluencies and non-lexical sounds.
	"""

	# Markdown shown below the controls.
	_DETAILS_MARKDOWN = """
	### What Makes This "Very Verbatim"?

	Unlike standard transcription that cleans up speech, this configuration:
	- Keeps hesitations: "um", "uh", "ah", "er", "mm"
	- Preserves fillers: "like", "you know", "I mean"
	- Shows false starts: "I was- I went to the store"
	- Captures repetitions: "I I I think that..."
	- Includes non-words: Attempts to phonetically transcribe sounds
	- Lower thresholds: Captures quieter speech and partial words

	### Use Cases
	- Legal transcription requiring exact wording
	- Linguistic analysis of natural speech
	- Conversational AI training data
	- Medical/therapeutic session transcripts
	- Interview transcription with speaker mannerisms
	- Research requiring disfluency analysis

	### Tips for Best Results
	- Use clear audio with minimal background noise
	- Ensure consistent audio levels
	- For very noisy environments, pre-process audio
	- Specify language manually if auto-detect misidentifies
	"""


	def transcribe_wrapper(audio, task, timestamps, language_name):
	    """Click handler: map the dropdown label to its code, then transcribe."""
	    return transcribe_audio(audio, task, timestamps, LANGUAGES[language_name])


	# Build the Gradio interface: inputs on the left, transcript on the right.
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	    gr.Markdown(_INTRO_MARKDOWN)

	    with gr.Row():
	        with gr.Column():
	            audio_input = gr.Audio(
	                sources=["upload", "microphone"],
	                type="filepath",
	                label="Audio Input",
	            )

	            with gr.Row():
	                task_radio = gr.Radio(
	                    choices=["transcribe", "translate"],
	                    value="transcribe",
	                    label="Task",
	                    info="Transcribe in original language or translate to English",
	                )

	                language_dropdown = gr.Dropdown(
	                    choices=list(LANGUAGES.keys()),
	                    value="Auto-detect",
	                    label="Language",
	                    info="Select language or use auto-detect",
	                )

	            timestamps_checkbox = gr.Checkbox(
	                label="Return word-level timestamps",
	                value=False,
	            )

	            transcribe_btn = gr.Button(
	                "🎯 Transcribe Verbatim",
	                variant="primary",
	                size="lg",
	            )

	        with gr.Column():
	            output_text = gr.Textbox(
	                label="Verbatim Transcription",
	                lines=20,
	                show_copy_button=True,
	                placeholder="Your verbatim transcription will appear here...",
	            )

	    gr.Markdown(_DETAILS_MARKDOWN)

	    # Wire the button to the handler; input order matches the handler's
	    # positional parameters.
	    transcribe_btn.click(
	        fn=transcribe_wrapper,
	        inputs=[audio_input, task_radio, timestamps_checkbox, language_dropdown],
	        outputs=output_text,
	    )

	# Start the server only when the file is run as a script.
	if __name__ == "__main__":
	    demo.launch()