import os
import tempfile

import gradio as gr
import numpy as np
import spaces
import torch
from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# Model configuration - CrisperWhisper is designed for TRUE verbatim transcription:
# it transcribes EVERY word, including um, uh, fillers, stutters, and false starts.
MODEL_NAME = "nyrahealth/CrisperWhisper"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print(f"Loading {MODEL_NAME} for verbatim transcription...")
# Load the model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_NAME)
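# The processor bundles the tokenizer and feature extractor that the
# pipeline below is wired to.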
# Create a pipeline optimized for verbatim output
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=8,  # reduced batch size for stability
    return_timestamps="word",  # CrisperWhisper provides accurate word-level timestamps
    torch_dtype=torch_dtype,
    device=device,
)

print("Model loaded successfully!")
def get_audio_duration(audio_path):
    """Return the duration of an audio file in seconds, or None if it cannot be read."""
    try:
        audio = AudioSegment.from_file(audio_path)
        return len(audio) / 1000.0
    except Exception:
        return None
def slice_audio(audio_path, chunk_duration=300):
    """
    Slice audio into chunks of the specified duration (in seconds).
    Defaults to 5 minutes (300 seconds) per chunk. Chunks are written to
    temporary WAV files; the caller is responsible for deleting them.
    """
    audio = AudioSegment.from_file(audio_path)
    duration_ms = len(audio)
    chunk_duration_ms = chunk_duration * 1000

    chunks = []
    for i in range(0, duration_ms, chunk_duration_ms):
        chunk = audio[i:i + chunk_duration_ms]
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            chunk.export(temp_file.name, format="wav")
        chunks.append(temp_file.name)
    return chunks
def transcribe_audio_chunk(audio_input, task="transcribe", language=None):
    """
    Transcribe a single audio chunk with CrisperWhisper, which is
    specifically trained for verbatim transcription.
    """
    try:
        generate_kwargs = {"task": task}
        if language:
            generate_kwargs["language"] = language
        # CrisperWhisper produces verbatim transcription by default
        return pipe(audio_input, generate_kwargs=generate_kwargs)
    except Exception as e:
        # Fallback: retry without generate_kwargs (e.g. on a tensor shape mismatch)
        print(f"Error with generate_kwargs: {e}")
        try:
            return pipe(audio_input)
        except Exception as e2:
            raise RuntimeError(f"Transcription failed: {e2}") from e2
@spaces.GPU  # ZeroGPU: request a GPU slot for the duration of each transcription call
def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, progress=gr.Progress()):
    """
    Transcribe audio with very verbatim output using CrisperWhisper.

    CrisperWhisper transcribes every spoken word exactly as uttered, including:
    - fillers (um, uh, ah, er, mm)
    - pauses and hesitations
    - stutters and repetitions
    - false starts
    - non-standard utterances
    """
    if audio is None:
        return "Please provide an audio file or recording."

    temp_files = []
    try:
        # Handle the different audio input formats Gradio can deliver
        if isinstance(audio, str):
            audio_path = audio
        elif isinstance(audio, tuple):
            sr, audio_data = audio
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                import scipy.io.wavfile
                scipy.io.wavfile.write(temp_file.name, sr, audio_data)
                audio_path = temp_file.name
            temp_files.append(audio_path)
        else:
            return "Unsupported audio format."

        # Check the audio duration and slice if necessary
        duration = get_audio_duration(audio_path)
        chunk_duration = 300  # 5 minutes per chunk

        if duration and duration > chunk_duration:
            progress(0, desc=f"Audio is {duration:.1f}s long. Slicing into chunks...")
            audio_chunks = slice_audio(audio_path, chunk_duration)
            temp_files.extend(audio_chunks)
        else:
            audio_chunks = [audio_path]
        # Process each chunk
        all_transcriptions = []
        total_chunks = len(audio_chunks)

        for idx, chunk_path in enumerate(audio_chunks):
            progress((idx + 1) / total_chunks, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
            result = transcribe_audio_chunk(chunk_path, task, language)

            if return_timestamps and "chunks" in result:
                # Word timestamps are relative to the current chunk, so shift
                # them by the chunk's offset within the full recording.
                chunk_offset = idx * chunk_duration
                chunk_text = result["text"]
                timestamp_text = []
                for word_chunk in result["chunks"]:
                    start, end = word_chunk["timestamp"]
                    if start is not None and end is not None:
                        timestamp_text.append({
                            "start": start + chunk_offset,
                            "end": end + chunk_offset,
                            "text": word_chunk["text"],
                        })
                all_transcriptions.append({
                    "text": chunk_text,
                    "timestamps": timestamp_text,
                })
            else:
                all_transcriptions.append({
                    "text": result["text"],
                    "timestamps": [],
                })
        # Combine all chunk transcriptions
        full_text = " ".join(t["text"] for t in all_transcriptions)
        output = f"**Verbatim Transcription:**\n{full_text}\n"

        if return_timestamps:
            output += "\n**Word-level Timestamps:**\n"
            for trans in all_transcriptions:
                for ts in trans["timestamps"]:
                    output += f"[{ts['start']:.2f}s - {ts['end']:.2f}s] {ts['text']}\n"

        if duration:
            output += f"\n*Total duration: {duration:.1f}s | Processed in {total_chunks} chunk(s)*"

        return output
    except Exception as e:
        return f"Error during transcription: {str(e)}"
    finally:
        # Always clean up temporary files, even on failure
        for temp_file in temp_files:
            try:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
            except OSError:
                pass
# Language options for manual selection
LANGUAGES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Turkish": "tr",
    "Polish": "pl",
    "Ukrainian": "uk",
    "Vietnamese": "vi",
    "Thai": "th",
    "Indonesian": "id",
    "Czech": "cs",
    "Romanian": "ro",
    "Swedish": "sv",
    "Danish": "da",
    "Norwegian": "no",
    "Finnish": "fi",
    "Greek": "el",
    "Hebrew": "he",
}
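# These are the ISO 639-1 codes Whisper-family models expect; "Auto-detect"
# maps to None so the model infers the language itself.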
# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ Very Verbatim Multilingual Speech-to-Text

        Powered by **CrisperWhisper**, a model built specifically for verbatim transcription, with ZeroGPU acceleration.

        ## 🔥 TRUE Verbatim Transcription

        Unlike standard Whisper (which omits disfluencies), **CrisperWhisper captures EVERYTHING**:

        - ✅ **Fillers**: um, uh, ah, er, mm, like, you know
        - ✅ **Hesitations**: pauses, breath sounds, stutters
        - ✅ **False starts**: "I was- I went to the store"
        - ✅ **Repetitions**: "I I I think that..."
        - ✅ **Disfluencies**: every non-fluent speech element
        - ✅ **Accurate word-level timestamps**: precise timing even around disfluencies
        - ✅ **Multilingual**: supports 99+ languages
        - ✅ **Long audio support**: automatic 5-minute chunking

        **Perfect for:** legal transcription, linguistic research, therapy sessions, interviews,
        conversational AI training, or any use case that requires exact speech capture.
        """
    )
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio Input",
            )

            with gr.Row():
                task_radio = gr.Radio(
                    choices=["transcribe", "translate"],
                    value="transcribe",
                    label="Task",
                    info="Transcribe verbatim or translate to English",
                )
                language_dropdown = gr.Dropdown(
                    choices=list(LANGUAGES.keys()),
                    value="Auto-detect",
                    label="Language",
                    info="Select a language or use auto-detect",
                )

            timestamps_checkbox = gr.Checkbox(
                label="Show word-level timestamps",
                value=True,
                info="Display precise timing for each word",
            )

            transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")

        with gr.Column():
            output_text = gr.Textbox(
                label="Verbatim Transcription (includes all um, uh, hesitations)",
                lines=20,
                show_copy_button=True,
                placeholder=(
                    "Your VERY verbatim transcription will appear here...\n\n"
                    "Every um, uh, stutter, and hesitation will be captured!"
                ),
            )
    gr.Markdown(
        """
        ### Why CrisperWhisper for Verbatim?

        **Standard Whisper** is trained to transcribe the *intended meaning*, so it automatically cleans up speech:

        - ❌ Removes "um", "uh", "ah"
        - ❌ Omits false starts
        - ❌ Skips repetitions
        - ❌ Ignores stutters

        **CrisperWhisper** is trained specifically for verbatim transcription:

        - ✅ Keeps every filler word
        - ✅ Preserves all disfluencies
        - ✅ Captures exact speech patterns
        - ✅ Keeps timestamps accurate around hesitations

        ### Example Comparison

        **Input audio:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"

        **Standard Whisper:** "So I was thinking that we could go to the store"

        **CrisperWhisper:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"

        ### Use Cases

        - **Legal/court transcription**: exact wording required by law
        - **Linguistic research**: study of natural speech patterns and disfluencies
        - **Medical/therapy sessions**: capturing patient speech patterns
        - **Interview transcription**: preserving speaker mannerisms
        - **Conversational AI training**: realistic dialogue data
        - **Accessibility**: complete transcripts for deaf and hard-of-hearing users
        - **Language learning**: analyzing natural spoken language

        ### Tips for Best Results

        - Clear audio with minimal background noise works best
        - The model captures quiet speech, so keep audio levels consistent
        - Manual language selection can improve accuracy
        - Long files are automatically processed in 5-minute chunks
        - Timestamps help pinpoint the exact moments of hesitation
        """
    )
    # Set up the event handler
    def transcribe_wrapper(audio, task, timestamps, language_name, progress=gr.Progress()):
        language_code = LANGUAGES[language_name]
        return transcribe_audio(audio, task, timestamps, language_code, progress)

    transcribe_btn.click(
        fn=transcribe_wrapper,
        inputs=[audio_input, task_radio, timestamps_checkbox, language_dropdown],
        outputs=output_text,
    )
# Launch the app
if __name__ == "__main__":
    demo.launch()
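# Dependency note: besides the packages imported above (gradio, spaces, torch,
# transformers, numpy, pydub, scipy), pydub needs an ffmpeg binary on the host
# to decode non-WAV uploads (e.g. installed via packages.txt on Hugging Face Spaces).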