Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| import spaces | |
| import numpy as np | |
# Model configuration: Whisper large-v3, used below for verbatim transcription.
MODEL_NAME = "openai/whisper-large-v3"

# Run on the first CUDA device in half precision when one is available,
# otherwise fall back to CPU with full precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)
processor = AutoProcessor.from_pretrained(MODEL_NAME)

# The decoder context (config.max_target_positions) has to hold the forced
# start tokens, any prompt tokens AND the newly generated tokens. Asking for
# the full context as `max_new_tokens` therefore overflows it, so the default
# generation budget is capped at half of the context.
# NOTE(review): recent transformers releases raise a ValueError on overflow;
# confirm against the version pinned for this Space.
DEFAULT_MAX_NEW_TOKENS = model.config.max_target_positions // 2

# Create the ASR pipeline: 30 s chunks, decoded 16 at a time.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
    # Generation settings go through `generate_kwargs`; per-call
    # `generate_kwargs` may still override this default.
    generate_kwargs={"max_new_tokens": DEFAULT_MAX_NEW_TOKENS},
)
def _prepare_audio_input(audio):
    """Convert a Gradio audio value into an input the ASR pipeline accepts.

    File paths (and any other non-tuple value) are passed through unchanged.
    A ``(sample_rate, samples)`` tuple is converted to mono float32 and
    wrapped in the ``{"raw": ..., "sampling_rate": ...}`` dict form.
    """
    if not isinstance(audio, tuple):
        return audio

    sample_rate, samples = audio
    samples = np.asarray(samples)

    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM must be scaled into [-1.0, 1.0]; a bare cast to float32
        # would keep the raw integer magnitudes.
        info = np.iinfo(samples.dtype)
        full_scale = float(max(abs(info.min), info.max))
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # assumes a (samples, channels) layout for multi-channel input —
        # TODO confirm against the Gradio version in use
        samples = samples.mean(axis=1)

    return {"raw": samples, "sampling_rate": sample_rate}


@spaces.GPU
def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None):
    """
    Transcribe audio with the module-level Whisper pipeline, biased towards
    verbatim output (hesitations, fillers, repetitions).

    Decorated with ``spaces.GPU`` so that a ZeroGPU Space allocates a GPU for
    the duration of the call.

    Args:
        audio: File path, or a ``(sample_rate, numpy_array)`` tuple as produced
            by a Gradio audio component in numpy mode. ``None`` yields a hint
            message instead of a transcription.
        task: Either "transcribe" or "translate" (to English).
        return_timestamps: Whether to append word-level timestamps.
        language: Language code, or None to let the model auto-detect.

    Returns:
        A Markdown-formatted string with the transcription and, if requested
        and available, one ``[start - end] word`` line per word. On failure
        the string ``"Error during transcription: <message>"`` is returned
        instead of raising.
    """
    if audio is None:
        return "Please provide an audio file or recording."

    try:
        audio_input = _prepare_audio_input(audio)

        # Whisper is conditioned through prompt *token ids*, not a raw string.
        prompt_ids = processor.get_prompt_ids(
            "Transcribe everything verbatim, including um, uh, ah, filler words, hesitations, repetitions, false starts, and non-standard words.",
            return_tensors="pt",
        ).to(device)

        # Prompt tokens, forced start tokens and generated tokens all share the
        # decoder context, so budget the new tokens from what is left over.
        start_token_margin = 8  # slack for the forced start tokens
        max_new_tokens = max(
            1,
            model.config.max_target_positions
            - int(prompt_ids.shape[-1])
            - start_token_margin,
        )

        # NOTE(review): argument names follow transformers' Whisper
        # `generate()`; confirm against the pinned transformers version.
        generate_kwargs = {
            "task": task,
            "language": language,
            "max_new_tokens": max_new_tokens,
            "prompt_ids": prompt_ids,
            # Verbatim transcription settings
            "condition_on_prev_tokens": True,  # Better context for non-words
            "compression_ratio_threshold": 1.35,  # Lower threshold to keep more content
            "logprob_threshold": -1.0,  # Keep lower probability tokens (hesitations, fillers)
            "no_speech_threshold": 0.3,  # Lower to capture quiet speech/sounds
            "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),  # Temperature fallback
        }

        # `return_timestamps` is an argument of the pipeline call itself; the
        # pipeline only emits "chunks" when it is passed here rather than
        # inside `generate_kwargs`.
        pipe_kwargs = {"generate_kwargs": generate_kwargs}
        if return_timestamps:
            pipe_kwargs["return_timestamps"] = "word"

        result = pipe(audio_input, **pipe_kwargs)

        parts = [f"**Transcription:**\n{result['text']}\n"]
        if return_timestamps and "chunks" in result:
            parts.append("\n**Word-level Timestamps:**\n")
            for chunk in result["chunks"]:
                start, end = chunk["timestamp"][0], chunk["timestamp"][1]
                # Skip words whose start or end time is missing.
                if start is not None and end is not None:
                    parts.append(f"[{start:.2f}s - {end:.2f}s] {chunk['text']}\n")
        return "".join(parts)
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"Error during transcription: {e}"
# Language options for manual selection: display name -> language code passed
# to the model. None means "let the model detect the language".
LANGUAGES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Turkish": "tr",
    "Polish": "pl",
    "Ukrainian": "uk",
    "Vietnamese": "vi",
    "Thai": "th",
    "Indonesian": "id",
    "Czech": "cs",
    "Romanian": "ro",
    "Swedish": "sv",
    "Danish": "da",
    "Norwegian": "no",
    "Finnish": "fi",
    "Greek": "el",
    "Hebrew": "he",
}
# Create Gradio interface: controls on the left, transcription on the right,
# followed by an explanatory section.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ Very Verbatim Multilingual Speech-to-Text

        Powered by OpenAI Whisper Large V3 with ZeroGPU acceleration.

        **Verbatim Transcription Features:**
        - ✅ Captures hesitations (um, uh, ah, eh)
        - ✅ Transcribes filler words and false starts
        - ✅ Includes repetitions and stutters
        - ✅ Attempts to transcribe non-standard words and sounds
        - ✅ Preserves natural speech patterns
        - ✅ Word-level timestamps for precise alignment
        - ✅ Supports 99+ languages

        **Note:** This is optimized for verbatim transcription, capturing speech as naturally
        as possible including all disfluencies and non-lexical sounds.
        """
    )

    with gr.Row():
        with gr.Column():
            # type="filepath": the handler receives a path to the audio file.
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio Input",
            )
            with gr.Row():
                task_radio = gr.Radio(
                    choices=["transcribe", "translate"],
                    value="transcribe",
                    label="Task",
                    info="Transcribe in original language or translate to English",
                )
                language_dropdown = gr.Dropdown(
                    choices=list(LANGUAGES.keys()),
                    value="Auto-detect",
                    label="Language",
                    info="Select language or use auto-detect",
                )
            timestamps_checkbox = gr.Checkbox(
                label="Return word-level timestamps",
                value=False,
            )
            transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")

        with gr.Column():
            output_text = gr.Textbox(
                label="Verbatim Transcription",
                lines=20,
                show_copy_button=True,
                placeholder="Your verbatim transcription will appear here...",
            )

    gr.Markdown(
        """
        ### What Makes This "Very Verbatim"?

        Unlike standard transcription that cleans up speech, this configuration:
        - **Keeps hesitations**: "um", "uh", "ah", "er", "mm"
        - **Preserves fillers**: "like", "you know", "I mean"
        - **Shows false starts**: "I was- I went to the store"
        - **Captures repetitions**: "I I I think that..."
        - **Includes non-words**: Attempts to phonetically transcribe sounds
        - **Lower thresholds**: Captures quieter speech and partial words

        ### Use Cases
        - Legal transcription requiring exact wording
        - Linguistic analysis of natural speech
        - Conversational AI training data
        - Medical/therapeutic session transcripts
        - Interview transcription with speaker mannerisms
        - Research requiring disfluency analysis

        ### Tips for Best Results
        - Use clear audio with minimal background noise
        - Ensure consistent audio levels
        - For very noisy environments, pre-process audio
        - Specify language manually if auto-detect misidentifies
        """
    )

    # Set up event handler
    def transcribe_wrapper(audio, task, timestamps, language_name):
        """Map the dropdown's display name to a language code and transcribe.

        An unknown display name falls back to None (auto-detect) rather than
        raising a KeyError.
        """
        language_code = LANGUAGES.get(language_name)
        return transcribe_audio(audio, task, timestamps, language_code)

    transcribe_btn.click(
        fn=transcribe_wrapper,
        inputs=[audio_input, task_radio, timestamps_checkbox, language_dropdown],
        outputs=output_text,
    )
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()