Spaces:

rafaaa2105
/

Crisper-Whisper

Runtime error

App Files Community

rafaaa2105 committed on Oct 6

Commit

f9600ca

verified ·

1 Parent(s): f0d997e

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -67

app.py CHANGED Viewed

@@ -4,15 +4,17 @@ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import spaces
 import numpy as np
 from pydub import AudioSegment
-import io
 import tempfile
 import os
-# Model configuration - Using Whisper with settings optimized for verbatim transcription
-MODEL_NAME = "openai/whisper-large-v3"
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 # Load model and processor
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     MODEL_NAME,
@@ -24,25 +26,27 @@ model.to(device)
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
-# Create pipeline with verbatim-optimized settings
 pipe = pipeline(
     "automatic-speech-recognition",
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
-    max_new_tokens=384,
     chunk_length_s=30,
     batch_size=16,
-    return_timestamps=True,
     torch_dtype=torch_dtype,
     device=device,
 )
 def get_audio_duration(audio_path):
     """Get duration of audio file in seconds."""
     try:
         audio = AudioSegment.from_file(audio_path)
-        return len(audio) / 1000.0  # Convert ms to seconds
     except:
         return None
@@ -59,7 +63,6 @@ def slice_audio(audio_path, chunk_duration=300):
     for i in range(0, duration_ms, chunk_duration_ms):
         chunk = audio[i:i + chunk_duration_ms]
-        # Export chunk to temporary file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
             chunk.export(temp_file.name, format="wav")
             chunks.append(temp_file.name)
@@ -67,36 +70,31 @@ def slice_audio(audio_path, chunk_duration=300):
     return chunks
 @spaces.GPU
-def transcribe_audio_chunk(audio_input, task="transcribe", language=None, return_timestamps=False):
     """
-    Transcribe a single audio chunk with verbatim settings.
     """
-    # Configure pipeline parameters for VERBATIM transcription
     generate_kwargs = {
         "task": task,
-        "language": language,
-        # Verbatim transcription settings - only use supported parameters
-        "return_timestamps": "word" if return_timestamps else False,
     }
-    # Transcribe with verbatim settings
     result = pipe(audio_input, generate_kwargs=generate_kwargs)
     return result
 def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, progress=gr.Progress()):
     """
-    Transcribe audio with very verbatim output using Whisper model with ZeroGPU.
-    Automatically slices long audio files and processes in batches.
-    Args:
-        audio: Audio input (file path or numpy array)
-        task: Either "transcribe" or "translate" (to English)
-        return_timestamps: Whether to return word-level timestamps
-        language: Language code (None for auto-detect)
-        progress: Gradio progress tracker
-    Returns:
-        Verbatim transcription text and optional timestamp information
     """
     if audio is None:
         return "Please provide an audio file or recording."
@@ -108,9 +106,7 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
         if isinstance(audio, str):
             audio_path = audio
         elif isinstance(audio, tuple):
-            # Gradio microphone input format: (sample_rate, audio_data)
             sr, audio_data = audio
-            # Save to temporary file for processing
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                 import scipy.io.wavfile
                 scipy.io.wavfile.write(temp_file.name, sr, audio_data)
@@ -137,10 +133,9 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
         for idx, chunk_path in enumerate(audio_chunks):
             progress((idx + 1) / total_chunks, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
-            result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps)
             if return_timestamps and "chunks" in result:
-                # Add chunk offset to timestamps
                 chunk_offset = idx * chunk_duration
                 chunk_text = result["text"]
                 timestamp_text = []
@@ -168,7 +163,7 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
         # Combine all transcriptions
         full_text = " ".join([t["text"] for t in all_transcriptions])
-        output = f"**Transcription:**\n{full_text}\n"
         if return_timestamps:
             output += "\n**Word-level Timestamps:**\n"
@@ -231,18 +226,23 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         """
         # 🎙️ Very Verbatim Multilingual Speech-to-Text
-        Powered by OpenAI Whisper Large V3 with ZeroGPU acceleration.
-        **Verbatim Transcription Features:**
-        - ✅ Captures hesitations (um, uh, ah, eh)
-        - ✅ Transcribes filler words and false starts
-        - ✅ Includes repetitions and stutters
-        - ✅ Preserves natural speech patterns with word-level timestamps
-        - ✅ Supports 99+ languages
-        - ✅ **Automatic chunking for long audio files** (processes in 5-minute segments)
-        **Note:** Whisper Large V3 naturally captures disfluencies when using word-level timestamps.
-        The model transcribes speech as naturally as possible including hesitations and non-lexical sounds.
         """
     )
@@ -259,7 +259,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                     choices=["transcribe", "translate"],
                     value="transcribe",
                     label="Task",
-                    info="Transcribe in original language or translate to English"
                 )
                 language_dropdown = gr.Dropdown(
@@ -270,47 +270,62 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 )
             timestamps_checkbox = gr.Checkbox(
-                label="Return word-level timestamps",
-                value=False
             )
             transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")
         with gr.Column():
             output_text = gr.Textbox(
-                label="Verbatim Transcription",
                 lines=20,
                 show_copy_button=True,
-                placeholder="Your verbatim transcription will appear here...\n\nLong audio files will be automatically processed in chunks."
             )
     gr.Markdown(
         """
-        ### What Makes This "Very Verbatim"?
-        Whisper Large V3 with word-level timestamps naturally provides verbatim transcription:
-        - **Keeps hesitations**: "um", "uh", "ah", "er", "mm"
-        - **Preserves fillers**: "like", "you know", "I mean"
-        - **Shows false starts**: "I was- I went to the store"
-        - **Captures repetitions**: "I I I think that..."
-        - **Word-level precision**: Exact timestamps for every word
-        - **Handles long audio**: Automatically slices files longer than 5 minutes
         ### Use Cases
-        - Legal transcription requiring exact wording
-        - Linguistic analysis of natural speech
-        - Conversational AI training data
-        - Medical/therapeutic session transcripts
-        - Interview transcription with speaker mannerisms
-        - Research requiring disfluency analysis
-        - Podcast and long-form content transcription
         ### Tips for Best Results
-        - Use clear audio with minimal background noise
-        - Ensure consistent audio levels
-        - For very noisy environments, pre-process audio
-        - Specify language manually if auto-detect misidentifies
-        - Long files are automatically chunked (no length limit!)
         """
     )

 import spaces
 import numpy as np
 from pydub import AudioSegment
 import tempfile
 import os
+# Model configuration - Using CrisperWhisper for TRUE verbatim transcription
+# CrisperWhisper is designed to transcribe EVERY word including um, uh, fillers, stutters, false starts
+MODEL_NAME = "nyrahealth/CrisperWhisper"
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+print(f"Loading {MODEL_NAME} for verbatim transcription...")
 # Load model and processor
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     MODEL_NAME,
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
+# Create pipeline optimized for verbatim output
 pipe = pipeline(
     "automatic-speech-recognition",
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
+    max_new_tokens=448,
     chunk_length_s=30,
     batch_size=16,
+    return_timestamps="word",  # CrisperWhisper provides accurate word-level timestamps
     torch_dtype=torch_dtype,
     device=device,
 )
+print("Model loaded successfully!")
 def get_audio_duration(audio_path):
     """Get duration of audio file in seconds."""
     try:
         audio = AudioSegment.from_file(audio_path)
+        return len(audio) / 1000.0
     except:
         return None
     for i in range(0, duration_ms, chunk_duration_ms):
         chunk = audio[i:i + chunk_duration_ms]
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
             chunk.export(temp_file.name, format="wav")
             chunks.append(temp_file.name)
     return chunks
 @spaces.GPU
+def transcribe_audio_chunk(audio_input, task="transcribe", language=None):
     """
+    Transcribe a single audio chunk with CrisperWhisper.
+    This model is specifically trained for verbatim transcription.
     """
     generate_kwargs = {
         "task": task,
     }
+    if language:
+        generate_kwargs["language"] = language
+    # CrisperWhisper automatically provides verbatim transcription
     result = pipe(audio_input, generate_kwargs=generate_kwargs)
     return result
 def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, progress=gr.Progress()):
     """
+    Transcribe audio with VERY VERBATIM output using CrisperWhisper.
+    CrisperWhisper transcribes every spoken word exactly as it is, including:
+    - Fillers (um, uh, ah, er, mm)
+    - Pauses and hesitations
+    - Stutters and repetitions
+    - False starts
+    - Non-standard utterances
     """
     if audio is None:
         return "Please provide an audio file or recording."
         if isinstance(audio, str):
             audio_path = audio
         elif isinstance(audio, tuple):
             sr, audio_data = audio
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                 import scipy.io.wavfile
                 scipy.io.wavfile.write(temp_file.name, sr, audio_data)
         for idx, chunk_path in enumerate(audio_chunks):
             progress((idx + 1) / total_chunks, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
+            result = transcribe_audio_chunk(chunk_path, task, language)
             if return_timestamps and "chunks" in result:
                 chunk_offset = idx * chunk_duration
                 chunk_text = result["text"]
                 timestamp_text = []
         # Combine all transcriptions
         full_text = " ".join([t["text"] for t in all_transcriptions])
+        output = f"**Verbatim Transcription:**\n{full_text}\n"
         if return_timestamps:
             output += "\n**Word-level Timestamps:**\n"
         """
         # 🎙️ Very Verbatim Multilingual Speech-to-Text
+        Powered by **CrisperWhisper** - specifically designed for verbatim transcription with ZeroGPU acceleration.
+        ## 🔥 TRUE Verbatim Transcription
+        Unlike standard Whisper (which omits disfluencies), **CrisperWhisper captures EVERYTHING**:
+        - ✅ **Fillers**: um, uh, ah, er, mm, like, you know
+        - ✅ **Hesitations**: pauses, breath sounds, stutters
+        - ✅ **False Starts**: "I was- I went to the store"
+        - ✅ **Repetitions**: "I I I think that..."
+        - ✅ **Disfluencies**: Every non-fluent speech element
+        - ✅ **Accurate Word-Level Timestamps**: Precise timing even around disfluencies
+        - ✅ **Multilingual**: Supports 99+ languages
+        - ✅ **Long Audio Support**: Automatic 5-minute chunking
+        **Perfect for:** Legal transcription, linguistic research, therapy sessions, interviews,
+        conversational AI training, or any use case requiring exact speech capture.
         """
     )
                     choices=["transcribe", "translate"],
                     value="transcribe",
                     label="Task",
+                    info="Transcribe verbatim or translate to English"
                 )
                 language_dropdown = gr.Dropdown(
                 )
             timestamps_checkbox = gr.Checkbox(
+                label="Show word-level timestamps",
+                value=True,
+                info="Display precise timing for each word"
             )
             transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")
         with gr.Column():
             output_text = gr.Textbox(
+                label="Verbatim Transcription (includes all um, uh, hesitations)",
                 lines=20,
                 show_copy_button=True,
+                placeholder="Your VERY verbatim transcription will appear here...\n\nEvery um, uh, stutter, and hesitation will be captured!"
             )
     gr.Markdown(
         """
+        ### Why CrisperWhisper for Verbatim?
+        **Standard Whisper** is trained to transcribe the "intended meaning" - it automatically cleans up:
+        - ❌ Removes "um", "uh", "ah"
+        - ❌ Omits false starts
+        - ❌ Skips repetitions
+        - ❌ Ignores stutters
+        **CrisperWhisper** is specifically trained for verbatim transcription:
+        - ✅ Keeps every filler word
+        - ✅ Preserves all disfluencies
+        - ✅ Captures exact speech patterns
+        - ✅ Accurate timestamps around hesitations
+        ### Example Comparison
+        **Input Audio:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"
+        **Standard Whisper:** "So I was thinking that we could go to the store"
+        **CrisperWhisper:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"
         ### Use Cases
+        - **Legal/Court Transcription**: Exact wording required by law
+        - **Linguistic Research**: Study of natural speech patterns and disfluencies
+        - **Medical/Therapy Sessions**: Capturing patient speech patterns
+        - **Interview Transcription**: Preserving speaker mannerisms
+        - **Conversational AI Training**: Realistic dialogue data
+        - **Accessibility**: Providing complete transcripts for deaf/hard-of-hearing
+        - **Language Learning**: Analyzing natural spoken language
         ### Tips for Best Results
+        - Clear audio with minimal background noise works best
+        - The model captures quiet speech - ensure consistent audio levels
+        - Manual language selection can improve accuracy
+        - Long files are automatically processed in 5-minute chunks
+        - Timestamps help identify exact moments of hesitations
         """
     )