import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import spaces
import numpy as np
# Model configuration: Whisper large-v3 with settings tuned for verbatim transcription
MODEL_NAME = "openai/whisper-large-v3"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# Load model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)
processor = AutoProcessor.from_pretrained(MODEL_NAME)
# Create pipeline with verbatim-optimized settings
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=384,  # leave headroom under Whisper's 448-token decoder limit for prompt/special tokens
    chunk_length_s=30,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
)
@spaces.GPU
def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None):
"""
Transcribe audio with very verbatim output using Whisper model with ZeroGPU.
Configured to capture hesitations, fillers, non-words, and all spoken sounds.
Args:
audio: Audio input (file path or numpy array)
task: Either "transcribe" or "translate" (to English)
return_timestamps: Whether to return word-level timestamps
language: Language code (None for auto-detect)
Returns:
Verbatim transcription text and optional timestamp information
"""
    if audio is None:
        return "Please provide an audio file or recording."

    try:
        # Handle the different input formats Gradio can deliver
        if isinstance(audio, str):
            audio_input = audio
        elif isinstance(audio, tuple):
            # Gradio microphone input format: (sample_rate, audio_data)
            sr, audio_data = audio
            audio_data = audio_data.astype(np.float32)
            # Integer PCM (e.g. int16) must be scaled into [-1, 1] before Whisper sees it
            if np.abs(audio_data).max() > 1.0:
                audio_data /= 32768.0
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=-1)  # down-mix to mono
            audio_input = {"array": audio_data, "sampling_rate": sr}
        else:
            audio_input = audio
        # Configure generation parameters for VERBATIM transcription.
        # Note: transformers' Whisper expects `condition_on_prev_tokens` (not
        # openai-whisper's `condition_on_previous_text`) and prompt token ids
        # (`prompt_ids`) rather than a raw "prompt" string.
        prompt_ids = processor.get_prompt_ids(
            "Transcribe everything verbatim, including um, uh, ah, filler words, "
            "hesitations, repetitions, false starts, and non-standard words.",
            return_tensors="pt",
        ).to(device)
        generate_kwargs = {
            "task": task,
            "language": language,
            # Verbatim transcription settings
            "condition_on_prev_tokens": True,  # better context for non-words
            "compression_ratio_threshold": 1.35,  # lower threshold keeps more content
            "logprob_threshold": -1.0,  # keep low-probability tokens (hesitations, fillers)
            "no_speech_threshold": 0.3,  # lower to capture quiet speech/sounds
            "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),  # temperature fallback for better coverage
            "prompt_ids": prompt_ids,
        }
        # Transcribe with verbatim settings; `return_timestamps` is a pipeline
        # argument, not a generate kwarg
        result = pipe(
            audio_input,
            return_timestamps="word" if return_timestamps else False,
            generate_kwargs=generate_kwargs,
        )
        # Format output
        text = result["text"]
        output = f"**Transcription:**\n{text}\n"
        if return_timestamps and "chunks" in result:
            output += "\n**Word-level Timestamps:**\n"
            for chunk in result["chunks"]:
                start, end = chunk["timestamp"]
                if start is not None and end is not None:
                    output += f"[{start:.2f}s - {end:.2f}s] {chunk['text']}\n"
        return output
    except Exception as e:
        return f"Error during transcription: {str(e)}"
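# Illustrative direct call, commented out so nothing executes at import time on
# Spaces. The tuple mimics Gradio microphone input: one second of int16 silence
# at 16 kHz:
#
#     mic_sample = (16000, np.zeros(16000, dtype=np.int16))
#     print(transcribe_audio(mic_sample, task="transcribe", return_timestamps=True))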
# Language options for manual selection
LANGUAGES = {
"Auto-detect": None,
"English": "en",
"Spanish": "es",
"French": "fr",
"German": "de",
"Italian": "it",
"Portuguese": "pt",
"Dutch": "nl",
"Russian": "ru",
"Chinese": "zh",
"Japanese": "ja",
"Korean": "ko",
"Arabic": "ar",
"Hindi": "hi",
"Turkish": "tr",
"Polish": "pl",
"Ukrainian": "uk",
"Vietnamese": "vi",
"Thai": "th",
"Indonesian": "id",
"Czech": "cs",
"Romanian": "ro",
"Swedish": "sv",
"Danish": "da",
"Norwegian": "no",
"Finnish": "fi",
"Greek": "el",
"Hebrew": "he",
}
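# Example: dropdown labels resolve to Whisper language codes, e.g.
# LANGUAGES["Spanish"] -> "es", while LANGUAGES["Auto-detect"] -> None lets
# Whisper infer the language from the audio itself.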
# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ Very Verbatim Multilingual Speech-to-Text

        Powered by OpenAI Whisper Large V3 with ZeroGPU acceleration.

        **Verbatim Transcription Features:**
        - ✅ Captures hesitations (um, uh, ah, eh)
        - ✅ Transcribes filler words and false starts
        - ✅ Includes repetitions and stutters
        - ✅ Attempts to transcribe non-standard words and sounds
        - ✅ Preserves natural speech patterns
        - ✅ Word-level timestamps for precise alignment
        - ✅ Supports 99+ languages

        **Note:** This app is optimized for verbatim transcription, capturing speech
        as naturally as possible, including all disfluencies and non-lexical sounds.
        """
    )
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio Input"
            )
            with gr.Row():
                task_radio = gr.Radio(
                    choices=["transcribe", "translate"],
                    value="transcribe",
                    label="Task",
                    info="Transcribe in the original language or translate to English"
                )
                language_dropdown = gr.Dropdown(
                    choices=list(LANGUAGES.keys()),
                    value="Auto-detect",
                    label="Language",
                    info="Select a language or use auto-detect"
                )
            timestamps_checkbox = gr.Checkbox(
                label="Return word-level timestamps",
                value=False
            )
            transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")
        with gr.Column():
            output_text = gr.Textbox(
                label="Verbatim Transcription",
                lines=20,
                show_copy_button=True,
                placeholder="Your verbatim transcription will appear here..."
            )
    gr.Markdown(
        """
        ### What Makes This "Very Verbatim"?

        Unlike standard transcription, which cleans up speech, this configuration:
        - **Keeps hesitations**: "um", "uh", "ah", "er", "mm"
        - **Preserves fillers**: "like", "you know", "I mean"
        - **Shows false starts**: "I was- I went to the store"
        - **Captures repetitions**: "I I I think that..."
        - **Includes non-words**: attempts to transcribe sounds phonetically
        - **Lower thresholds**: captures quieter speech and partial words

        ### Use Cases
        - Legal transcription requiring exact wording
        - Linguistic analysis of natural speech
        - Conversational AI training data
        - Medical/therapeutic session transcripts
        - Interview transcription with speaker mannerisms
        - Research requiring disfluency analysis

        ### Tips for Best Results
        - Use clear audio with minimal background noise
        - Ensure consistent audio levels
        - Pre-process audio recorded in very noisy environments
        - Specify the language manually if auto-detect misidentifies it
        """
    )
    # Set up the event handler
    def transcribe_wrapper(audio, task, timestamps, language_name):
        language_code = LANGUAGES[language_name]
        return transcribe_audio(audio, task, timestamps, language_code)

    transcribe_btn.click(
        fn=transcribe_wrapper,
        inputs=[audio_input, task_radio, timestamps_checkbox, language_dropdown],
        outputs=output_text
    )
# Launch the app
if __name__ == "__main__":
    demo.launch()
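# To run locally (assumes `gradio`, `torch`, `transformers`, `accelerate`, and
# `spaces` are installed; the `spaces` package documents `@spaces.GPU` as having
# no effect outside a ZeroGPU Space):
#
#     python app.py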