# Crisper-Whisper / app.py
# rafaaa2105's picture
# Create app.py
# ac96ee7 verified
# raw
# history blame
# 8.32 kB
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import spaces
import numpy as np
# Model configuration: the Whisper checkpoint shared by the model and processor.
MODEL_NAME = "openai/whisper-large-v3"

# Half precision on the first CUDA device when one is available, otherwise
# full precision on the CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Load the weights once at import time and place them on the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    use_safetensors=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
).to(device)

# The processor bundles the tokenizer and feature extractor used below.
processor = AutoProcessor.from_pretrained(MODEL_NAME)

# Speech-recognition pipeline shared by every request. Audio is processed in
# 30-second chunks, 16 chunks per batch.
# NOTE(review): max_new_tokens=448 looks like it equals Whisper's full decoder
# length, leaving no room for start/prompt tokens -- confirm against the
# installed transformers version.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
    chunk_length_s=30,
    batch_size=16,
    max_new_tokens=448,
)
def _to_mono_float32(audio_data):
    """Return raw audio samples as a mono float32 array."""
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.signedinteger):
        # Integer PCM (e.g. int16 from a microphone) has to be rescaled to
        # [-1.0, 1.0); a bare astype() would keep the +/-32768 amplitudes.
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Multi-channel audio arrives as (samples, channels); the ASR pipeline
        # only accepts a single channel, so average the channels.
        samples = samples.mean(axis=1)
    return samples


def _format_transcription(result, include_timestamps):
    """Render a pipeline result dict as the Markdown-style text shown in the UI."""
    parts = [f"**Transcription:**\n{result['text']}\n"]
    if include_timestamps and "chunks" in result:
        parts.append("\n**Word-level Timestamps:**\n")
        for chunk in result["chunks"]:
            start, end = chunk["timestamp"][0], chunk["timestamp"][1]
            # Skip words with an open-ended timestamp; they cannot be formatted.
            if start is not None and end is not None:
                parts.append(f"[{start:.2f}s - {end:.2f}s] {chunk['text']}\n")
    return "".join(parts)


@spaces.GPU
def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None):
    """
    Transcribe audio with the module-level Whisper pipeline using verbatim-oriented settings.

    Args:
        audio: A file path, a ``(sample_rate, samples)`` tuple as produced by a
            Gradio numpy audio input, or any other input the pipeline accepts.
        task: Either "transcribe" or "translate" (to English).
        return_timestamps: Whether to append word-level timestamps to the output.
        language: Language code, or None to let the model detect the language.

    Returns:
        The formatted transcription text (plus a timestamp listing when
        requested). If ``audio`` is None a prompt asking for input is returned,
        and any failure is reported as an "Error during transcription: ..."
        string instead of being raised, so the UI always receives text.
    """
    if audio is None:
        return "Please provide an audio file or recording."

    try:
        if isinstance(audio, tuple):
            # Gradio numpy format: (sample_rate, audio_data)
            sr, audio_data = audio
            audio_input = {"array": _to_mono_float32(audio_data), "sampling_rate": sr}
        else:
            # File paths and any other pipeline-compatible input pass through.
            audio_input = audio

        # Whisper takes a prompt as token ids, not as raw text.
        # NOTE(review): prompt tokens count against the decoder length together
        # with the pipeline's max_new_tokens -- confirm the budget on the
        # installed transformers version.
        prompt_ids = processor.get_prompt_ids(
            "Transcribe everything verbatim, including um, uh, ah, filler words, hesitations, repetitions, false starts, and non-standard words.",
            return_tensors="pt",
        ).to(device)

        # Option names follow transformers' Whisper generate() API; the
        # openai-whisper spellings ("condition_on_previous_text", "prompt")
        # are rejected as unused model kwargs.
        generate_kwargs = {
            "task": task,
            "language": language,
            "condition_on_prev_tokens": True,
            "compression_ratio_threshold": 1.35,
            "logprob_threshold": -1.0,
            "no_speech_threshold": 0.3,
            "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),  # fallback schedule
            "prompt_ids": prompt_ids,
        }

        # return_timestamps is a pipeline argument: only then does the
        # pipeline post-process the output into result["chunks"].
        pipe_kwargs = {"generate_kwargs": generate_kwargs}
        if return_timestamps:
            pipe_kwargs["return_timestamps"] = "word"

        result = pipe(audio_input, **pipe_kwargs)
        return _format_transcription(result, return_timestamps)
    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing the app.
        return f"Error during transcription: {str(e)}"
# Dropdown label -> language code handed to the model. The None entry means
# "let the model detect the language". Insertion order is the order in which
# the labels are listed in the UI, so keep it stable.
LANGUAGES = {
    "Auto-detect": None,
    "English": "en", "Spanish": "es", "French": "fr",
    "German": "de", "Italian": "it", "Portuguese": "pt",
    "Dutch": "nl", "Russian": "ru", "Chinese": "zh",
    "Japanese": "ja", "Korean": "ko", "Arabic": "ar",
    "Hindi": "hi", "Turkish": "tr", "Polish": "pl",
    "Ukrainian": "uk", "Vietnamese": "vi", "Thai": "th",
    "Indonesian": "id", "Czech": "cs", "Romanian": "ro",
    "Swedish": "sv", "Danish": "da", "Norwegian": "no",
    "Finnish": "fi", "Greek": "el", "Hebrew": "he",
}
# Markdown shown at the top of the page. Blank lines separate the sections so
# the note is rendered as its own paragraph instead of being absorbed into the
# last bullet.
HEADER_MARKDOWN = """
# 🎙️ Very Verbatim Multilingual Speech-to-Text

Powered by OpenAI Whisper Large V3 with ZeroGPU acceleration.

**Verbatim Transcription Features:**

- ✅ Captures hesitations (um, uh, ah, eh)
- ✅ Transcribes filler words and false starts
- ✅ Includes repetitions and stutters
- ✅ Attempts to transcribe non-standard words and sounds
- ✅ Preserves natural speech patterns
- ✅ Word-level timestamps for precise alignment
- ✅ Supports 99+ languages

**Note:** This is optimized for verbatim transcription, capturing speech as naturally
as possible including all disfluencies and non-lexical sounds.
"""

# Markdown shown below the controls.
DETAILS_MARKDOWN = """
### What Makes This "Very Verbatim"?

Unlike standard transcription that cleans up speech, this configuration:

- **Keeps hesitations**: "um", "uh", "ah", "er", "mm"
- **Preserves fillers**: "like", "you know", "I mean"
- **Shows false starts**: "I was- I went to the store"
- **Captures repetitions**: "I I I think that..."
- **Includes non-words**: Attempts to phonetically transcribe sounds
- **Lower thresholds**: Captures quieter speech and partial words

### Use Cases

- Legal transcription requiring exact wording
- Linguistic analysis of natural speech
- Conversational AI training data
- Medical/therapeutic session transcripts
- Interview transcription with speaker mannerisms
- Research requiring disfluency analysis

### Tips for Best Results

- Use clear audio with minimal background noise
- Ensure consistent audio levels
- For very noisy environments, pre-process audio
- Specify language manually if auto-detect misidentifies
"""

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(HEADER_MARKDOWN)

    with gr.Row():
        # Left column: audio input and transcription options.
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio Input",
            )
            with gr.Row():
                task_radio = gr.Radio(
                    choices=["transcribe", "translate"],
                    value="transcribe",
                    label="Task",
                    info="Transcribe in original language or translate to English",
                )
                language_dropdown = gr.Dropdown(
                    choices=list(LANGUAGES.keys()),
                    value="Auto-detect",
                    label="Language",
                    info="Select language or use auto-detect",
                )
            timestamps_checkbox = gr.Checkbox(
                label="Return word-level timestamps",
                value=False,
            )
            transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")

        # Right column: transcription result.
        with gr.Column():
            output_text = gr.Textbox(
                label="Verbatim Transcription",
                lines=20,
                show_copy_button=True,
                placeholder="Your verbatim transcription will appear here...",
            )

    gr.Markdown(DETAILS_MARKDOWN)

    # Set up event handler
    def transcribe_wrapper(audio, task, timestamps, language_name):
        """Map the dropdown label to its language code and run the transcription."""
        language_code = LANGUAGES[language_name]
        return transcribe_audio(audio, task, timestamps, language_code)

    # Event listeners have to be registered inside the Blocks context.
    transcribe_btn.click(
        fn=transcribe_wrapper,
        inputs=[audio_input, task_radio, timestamps_checkbox, language_dropdown],
        outputs=output_text,
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()