import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import spaces
import numpy as np
from pydub import AudioSegment
import tempfile
import os

# Model configuration - Using CrisperWhisper for TRUE verbatim transcription
# CrisperWhisper is designed to transcribe EVERY word including um, uh, fillers, stutters, false starts
MODEL_NAME = "nyrahealth/CrisperWhisper"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print(f"Loading {MODEL_NAME} for verbatim transcription...")

# Load model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_NAME)

# Create pipeline optimized for verbatim output
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=8,  # Reduced batch size for stability
    return_timestamps="word",  # CrisperWhisper provides accurate word-level timestamps
    torch_dtype=torch_dtype,
    device=device,
)

print("Model loaded successfully!")


def get_audio_duration(audio_path):
    """Get duration of audio file in seconds."""
    try:
        audio = AudioSegment.from_file(audio_path)
        return len(audio) / 1000.0
    except:
        return None


def slice_audio(audio_path, chunk_duration=300):
    """
    Slice audio into chunks of specified duration (in seconds).
    Default is 5 minutes (300 seconds) per chunk.
    """
    audio = AudioSegment.from_file(audio_path)
    duration_ms = len(audio)
    chunk_duration_ms = chunk_duration * 1000

    chunks = []
    for i in range(0, duration_ms, chunk_duration_ms):
        chunk = audio[i:i + chunk_duration_ms]
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            chunk.export(temp_file.name, format="wav")
            chunks.append(temp_file.name)
    return chunks


@spaces.GPU
def transcribe_audio_chunk(audio_input, task="transcribe", language=None):
    """
    Transcribe a single audio chunk with CrisperWhisper.
    This model is specifically trained for verbatim transcription.
    """
    try:
        generate_kwargs = {
            "task": task,
        }
        if language:
            generate_kwargs["language"] = language

        # CrisperWhisper automatically provides verbatim transcription
        result = pipe(audio_input, generate_kwargs=generate_kwargs)
        return result
    except Exception as e:
        # Fallback: try without generate_kwargs if there's a tensor mismatch
        print(f"Error with generate_kwargs: {e}")
        try:
            result = pipe(audio_input)
            return result
        except Exception as e2:
            raise Exception(f"Transcription failed: {str(e2)}")


def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, progress=gr.Progress()):
    """
    Transcribe audio with VERY VERBATIM output using CrisperWhisper.

    CrisperWhisper transcribes every spoken word exactly as it is, including:
    - Fillers (um, uh, ah, er, mm)
    - Pauses and hesitations
    - Stutters and repetitions
    - False starts
    - Non-standard utterances
    """
    if audio is None:
        return "Please provide an audio file or recording."

    temp_files = []
    try:
        # Handle different audio input formats
        if isinstance(audio, str):
            audio_path = audio
        elif isinstance(audio, tuple):
            sr, audio_data = audio
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                import scipy.io.wavfile
                scipy.io.wavfile.write(temp_file.name, sr, audio_data)
                audio_path = temp_file.name
            temp_files.append(audio_path)
        else:
            return "Unsupported audio format."

        # Check audio duration and slice if necessary
        duration = get_audio_duration(audio_path)
        chunk_duration = 300  # 5 minutes per chunk

        if duration and duration > chunk_duration:
            progress(0, desc=f"Audio is {duration:.1f}s long. Slicing into chunks...")
            audio_chunks = slice_audio(audio_path, chunk_duration)
            temp_files.extend(audio_chunks)
        else:
            audio_chunks = [audio_path]

        # Process each chunk
        all_transcriptions = []
        total_chunks = len(audio_chunks)

        for idx, chunk_path in enumerate(audio_chunks):
            progress((idx + 1) / total_chunks, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")

            result = transcribe_audio_chunk(chunk_path, task, language)

            if return_timestamps and "chunks" in result:
                chunk_offset = idx * chunk_duration
                chunk_text = result["text"]
                timestamp_text = []
                for word_chunk in result["chunks"]:
                    start = word_chunk["timestamp"][0]
                    end = word_chunk["timestamp"][1]
                    if start is not None and end is not None:
                        timestamp_text.append({
                            "start": start + chunk_offset,
                            "end": end + chunk_offset,
                            "text": word_chunk["text"]
                        })
                all_transcriptions.append({
                    "text": chunk_text,
                    "timestamps": timestamp_text
                })
            else:
                all_transcriptions.append({
                    "text": result["text"],
                    "timestamps": []
                })

        # Combine all transcriptions
        full_text = " ".join([t["text"] for t in all_transcriptions])

        output = f"**Verbatim Transcription:**\n{full_text}\n"

        if return_timestamps:
            output += "\n**Word-level Timestamps:**\n"
            for trans in all_transcriptions:
                for ts in trans["timestamps"]:
                    output += f"[{ts['start']:.2f}s - {ts['end']:.2f}s] {ts['text']}\n"

        if duration:
            output += f"\n*Total duration: {duration:.1f}s | Processed in {total_chunks} chunk(s)*"

        return output

    except Exception as e:
        return f"Error during transcription: {str(e)}"
    finally:
        # Clean up temporary files
        for temp_file in temp_files:
            try:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
            except:
                pass


# Language options for manual selection
LANGUAGES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Turkish": "tr",
    "Polish": "pl",
    "Ukrainian": "uk",
    "Vietnamese": "vi",
    "Thai": "th",
    "Indonesian": "id",
    "Czech": "cs",
    "Romanian": "ro",
    "Swedish": "sv",
    "Danish": "da",
    "Norwegian": "no",
    "Finnish": "fi",
    "Greek": "el",
    "Hebrew": "he",
}

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ Very Verbatim Multilingual Speech-to-Text

        Powered by **CrisperWhisper** - specifically designed for verbatim transcription with ZeroGPU acceleration.

        ## 🔥 TRUE Verbatim Transcription

        Unlike standard Whisper (which omits disfluencies), **CrisperWhisper captures EVERYTHING**:

        - ✅ **Fillers**: um, uh, ah, er, mm, like, you know
        - ✅ **Hesitations**: pauses, breath sounds, stutters
        - ✅ **False Starts**: "I was- I went to the store"
        - ✅ **Repetitions**: "I I I think that..."
        - ✅ **Disfluencies**: Every non-fluent speech element
        - ✅ **Accurate Word-Level Timestamps**: Precise timing even around disfluencies
        - ✅ **Multilingual**: Supports 99+ languages
        - ✅ **Long Audio Support**: Automatic 5-minute chunking

        **Perfect for:** Legal transcription, linguistic research, therapy sessions, interviews, conversational AI training, or any use case requiring exact speech capture.
        """
    )

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio Input"
            )

            with gr.Row():
                task_radio = gr.Radio(
                    choices=["transcribe", "translate"],
                    value="transcribe",
                    label="Task",
                    info="Transcribe verbatim or translate to English"
                )
                language_dropdown = gr.Dropdown(
                    choices=list(LANGUAGES.keys()),
                    value="Auto-detect",
                    label="Language",
                    info="Select language or use auto-detect"
                )

            timestamps_checkbox = gr.Checkbox(
                label="Show word-level timestamps",
                value=True,
                info="Display precise timing for each word"
            )

            transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")

        with gr.Column():
            output_text = gr.Textbox(
                label="Verbatim Transcription (includes all um, uh, hesitations)",
                lines=20,
                show_copy_button=True,
                placeholder="Your VERY verbatim transcription will appear here...\n\nEvery um, uh, stutter, and hesitation will be captured!"
            )

    gr.Markdown(
        """
        ### Why CrisperWhisper for Verbatim?

        **Standard Whisper** is trained to transcribe the "intended meaning" - it automatically cleans up:
        - ❌ Removes "um", "uh", "ah"
        - ❌ Omits false starts
        - ❌ Skips repetitions
        - ❌ Ignores stutters

        **CrisperWhisper** is specifically trained for verbatim transcription:
        - ✅ Keeps every filler word
        - ✅ Preserves all disfluencies
        - ✅ Captures exact speech patterns
        - ✅ Accurate timestamps around hesitations

        ### Example Comparison

        **Input Audio:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"

        **Standard Whisper:** "So I was thinking that we could go to the store"

        **CrisperWhisper:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"

        ### Use Cases

        - **Legal/Court Transcription**: Exact wording required by law
        - **Linguistic Research**: Study of natural speech patterns and disfluencies
        - **Medical/Therapy Sessions**: Capturing patient speech patterns
        - **Interview Transcription**: Preserving speaker mannerisms
        - **Conversational AI Training**: Realistic dialogue data
        - **Accessibility**: Providing complete transcripts for deaf/hard-of-hearing
        - **Language Learning**: Analyzing natural spoken language

        ### Tips for Best Results

        - Clear audio with minimal background noise works best
        - The model captures quiet speech - ensure consistent audio levels
        - Manual language selection can improve accuracy
        - Long files are automatically processed in 5-minute chunks
        - Timestamps help identify exact moments of hesitations
        """
    )

    # Set up event handler
    def transcribe_wrapper(audio, task, timestamps, language_name, progress=gr.Progress()):
        language_code = LANGUAGES[language_name]
        return transcribe_audio(audio, task, timestamps, language_code, progress)

    transcribe_btn.click(
        fn=transcribe_wrapper,
        inputs=[audio_input, task_radio, timestamps_checkbox, language_dropdown],
        outputs=output_text
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()