Spaces:

rafaaa2105
/

Crisper-Whisper

Runtime error

App Files Files Community

rafaaa2105 committed on Oct 6

Commit

9d3319a

verified ·

1 Parent(s): 4a11f6a

Update app.py

Browse files

Files changed (1) hide show

app.py +138 -36

app.py CHANGED Viewed

@@ -3,6 +3,10 @@ import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import spaces
 import numpy as np
 # Model configuration - Using Whisper with settings optimized for verbatim transcription
 MODEL_NAME = "openai/whisper-large-v3"
@@ -26,24 +30,75 @@ pipe = pipeline(
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
-    max_new_tokens=448,  # Increased for verbatim transcription
     chunk_length_s=30,
     batch_size=16,
     torch_dtype=torch_dtype,
     device=device,
 )
 @spaces.GPU
-def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None):
     """
     Transcribe audio with very verbatim output using Whisper model with ZeroGPU.
-    Configured to capture hesitations, fillers, non-words, and all spoken sounds.
     Args:
         audio: Audio input (file path or numpy array)
         task: Either "transcribe" or "translate" (to English)
         return_timestamps: Whether to return word-level timestamps
         language: Language code (None for auto-detect)
     Returns:
         Verbatim transcription text and optional timestamp information
@@ -51,54 +106,97 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
     if audio is None:
         return "Please provide an audio file or recording."
     try:
         # Handle different audio input formats
         if isinstance(audio, str):
-            audio_input = audio
         elif isinstance(audio, tuple):
             # Gradio microphone input format: (sample_rate, audio_data)
             sr, audio_data = audio
-            audio_input = {"array": audio_data.astype(np.float32), "sampling_rate": sr}
         else:
-            audio_input = audio
-        # Configure pipeline parameters for VERBATIM transcription
-        generate_kwargs = {
-            "task": task,
-            "language": language,
-            # Verbatim transcription settings
-            "condition_on_previous_text": True,  # Better context for non-words
-            "compression_ratio_threshold": 1.35,  # Lower threshold to keep more content
-            "logprob_threshold": -1.0,  # Keep lower probability tokens (hesitations, fillers)
-            "no_speech_threshold": 0.3,  # Lower to capture quiet speech/sounds
-            "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),  # Temperature fallback for better coverage
-            "prompt": "Transcribe everything verbatim, including um, uh, ah, filler words, hesitations, repetitions, false starts, and non-standard words.",
-        }
-        if return_timestamps:
-            generate_kwargs["return_timestamps"] = "word"
-        # Transcribe with verbatim settings
-        result = pipe(audio_input, generate_kwargs=generate_kwargs)
-        # Format output
-        text = result["text"]
-        # Additional info
-        output = f"**Transcription:**\n{text}\n"
-        if return_timestamps and "chunks" in result:
             output += "\n**Word-level Timestamps:**\n"
-            for chunk in result["chunks"]:
-                start = chunk["timestamp"][0]
-                end = chunk["timestamp"][1]
-                if start is not None and end is not None:
-                    output += f"[{start:.2f}s - {end:.2f}s] {chunk['text']}\n"
         return output
     except Exception as e:
         return f"Error during transcription: {str(e)}"
 # Language options for manual selection
 LANGUAGES = {
@@ -148,6 +246,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         - ✅ Preserves natural speech patterns
         - ✅ Word-level timestamps for precise alignment
         - ✅ Supports 99+ languages
         **Note:** This is optimized for verbatim transcription, capturing speech as naturally
         as possible including all disfluencies and non-lexical sounds.
@@ -189,7 +288,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 label="Verbatim Transcription",
                 lines=20,
                 show_copy_button=True,
-                placeholder="Your verbatim transcription will appear here..."
             )
     gr.Markdown(
@@ -203,6 +302,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         - **Captures repetitions**: "I I I think that..."
         - **Includes non-words**: Attempts to phonetically transcribe sounds
         - **Lower thresholds**: Captures quieter speech and partial words
         ### Use Cases
         - Legal transcription requiring exact wording
@@ -211,19 +311,21 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         - Medical/therapeutic session transcripts
         - Interview transcription with speaker mannerisms
         - Research requiring disfluency analysis
         ### Tips for Best Results
         - Use clear audio with minimal background noise
         - Ensure consistent audio levels
         - For very noisy environments, pre-process audio
         - Specify language manually if auto-detect misidentifies
         """
     )
     # Set up event handler
-    def transcribe_wrapper(audio, task, timestamps, language_name):
         language_code = LANGUAGES[language_name]
-        return transcribe_audio(audio, task, timestamps, language_code)
     transcribe_btn.click(
         fn=transcribe_wrapper,

 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import spaces
 import numpy as np
+from pydub import AudioSegment
+import io
+import tempfile
+import os
 # Model configuration - Using Whisper with settings optimized for verbatim transcription
 MODEL_NAME = "openai/whisper-large-v3"
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
+    max_new_tokens=384,  # Reduced to account for prompt tokens
     chunk_length_s=30,
     batch_size=16,
     torch_dtype=torch_dtype,
     device=device,
 )
+def get_audio_duration(audio_path):
+    """Return the duration of an audio file in seconds, or None on failure.
+
+    The file is loaded with ``AudioSegment.from_file`` and its length
+    (reported in milliseconds) is converted to seconds.
+
+    Args:
+        audio_path: Path of the audio file to inspect.
+
+    Returns:
+        The duration in seconds as a float, or None when the file cannot
+        be loaded.
+    """
+    try:
+        audio = AudioSegment.from_file(audio_path)
+    except Exception:
+        # Best-effort probe: an unreadable file means "duration unknown".
+        # ``Exception`` (rather than a bare ``except``) keeps
+        # KeyboardInterrupt and SystemExit propagating.
+        return None
+    return len(audio) / 1000.0  # Convert ms to seconds
+def slice_audio(audio_path, chunk_duration=300):
+    """Slice an audio file into consecutive WAV chunks on disk.
+
+    Each chunk is exported to its own temporary ``.wav`` file. The files
+    are created with ``delete=False``, so the caller is responsible for
+    removing the returned paths once it is done with them.
+
+    Args:
+        audio_path: Path of the audio file to slice.
+        chunk_duration: Length of each chunk in seconds (default 300,
+            i.e. 5 minutes). The last chunk may be shorter.
+
+    Returns:
+        A list of temporary file paths, one per chunk, in playback order.
+
+    Raises:
+        ValueError: If ``chunk_duration`` is not a positive duration.
+    """
+    # range() needs a positive integer step; coerce so float durations work.
+    chunk_duration_ms = int(chunk_duration * 1000)
+    if chunk_duration_ms <= 0:
+        raise ValueError("chunk_duration must be a positive number of seconds")
+
+    audio = AudioSegment.from_file(audio_path)
+    duration_ms = len(audio)
+    chunks = []
+    try:
+        for start_ms in range(0, duration_ms, chunk_duration_ms):
+            # Reserve a unique path, then close the handle before exporting:
+            # writing to a file that is still held open fails on platforms
+            # that lock open files.
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+                chunk_path = temp_file.name
+            chunks.append(chunk_path)
+            chunk = audio[start_ms:start_ms + chunk_duration_ms]
+            chunk.export(chunk_path, format="wav")
+    except Exception:
+        # The caller never receives the paths when slicing fails, so remove
+        # whatever was already written instead of leaking temp files.
+        for chunk_path in chunks:
+            try:
+                os.unlink(chunk_path)
+            except OSError:
+                pass
+        raise
+    return chunks
 @spaces.GPU
+def transcribe_audio_chunk(audio_input, task="transcribe", language=None, return_timestamps=False):
+    """Transcribe a single audio chunk with verbatim-oriented settings.
+
+    Args:
+        audio_input: Audio to transcribe, passed straight to ``pipe``.
+        task: Either "transcribe" or "translate".
+        language: Language code, or None to let the model auto-detect.
+        return_timestamps: When true, ask the pipeline for word-level
+            timestamps so the result carries a "chunks" entry.
+
+    Returns:
+        The result returned by ``pipe`` for this chunk.
+    """
+    # Whisper prompts are passed to generate() as token ids, not raw text.
+    # Kept short to leave room under the token limit.
+    prompt_ids = processor.get_prompt_ids(
+        "Transcribe verbatim including um, uh, hesitations.",
+        return_tensors="pt",
+    ).to(device)
+
+    # Configure generation parameters for VERBATIM transcription.
+    # NOTE(review): option names follow the Transformers Whisper generate()
+    # API (condition_on_prev_tokens / prompt_ids); the openai-whisper names
+    # (condition_on_previous_text / prompt) are rejected as unused kwargs.
+    generate_kwargs = {
+        "task": task,
+        "language": language,
+        "condition_on_prev_tokens": True,
+        "compression_ratio_threshold": 1.35,
+        "logprob_threshold": -1.0,
+        "no_speech_threshold": 0.3,
+        "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
+        "prompt_ids": prompt_ids,
+    }
+
+    pipe_kwargs = {"generate_kwargs": generate_kwargs}
+    if return_timestamps:
+        # Word-level timestamps are a pipeline option, not a generate kwarg.
+        pipe_kwargs["return_timestamps"] = "word"
+
+    # Transcribe with verbatim settings
+    return pipe(audio_input, **pipe_kwargs)
+def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, progress=gr.Progress()):
     """
     Transcribe audio with very verbatim output using Whisper model with ZeroGPU.
+    Automatically slices long audio files and processes in batches.
     Args:
         audio: Audio input (file path or numpy array)
         task: Either "transcribe" or "translate" (to English)
         return_timestamps: Whether to return word-level timestamps
         language: Language code (None for auto-detect)
+        progress: Gradio progress tracker
     Returns:
         Verbatim transcription text and optional timestamp information
     if audio is None:
         return "Please provide an audio file or recording."
+    temp_files = []
     try:
         # Handle different audio input formats
         if isinstance(audio, str):
+            audio_path = audio
         elif isinstance(audio, tuple):
             # Gradio microphone input format: (sample_rate, audio_data)
             sr, audio_data = audio
+            # Save to temporary file for processing
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+                import scipy.io.wavfile
+                scipy.io.wavfile.write(temp_file.name, sr, audio_data)
+                audio_path = temp_file.name
+                temp_files.append(audio_path)
         else:
+            return "Unsupported audio format."
+        # Check audio duration and slice if necessary
+        duration = get_audio_duration(audio_path)
+        chunk_duration = 300  # 5 minutes per chunk
+        if duration and duration > chunk_duration:
+            progress(0, desc=f"Audio is {duration:.1f}s long. Slicing into chunks...")
+            audio_chunks = slice_audio(audio_path, chunk_duration)
+            temp_files.extend(audio_chunks)
+        else:
+            audio_chunks = [audio_path]
+        # Process each chunk
+        all_transcriptions = []
+        total_chunks = len(audio_chunks)
+        for idx, chunk_path in enumerate(audio_chunks):
+            progress((idx + 1) / total_chunks, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
+            result = transcribe_audio_chunk(chunk_path, task, language)
+            if return_timestamps and "chunks" in result:
+                # Add chunk offset to timestamps
+                chunk_offset = idx * chunk_duration
+                chunk_text = result["text"]
+                timestamp_text = []
+                for word_chunk in result["chunks"]:
+                    start = word_chunk["timestamp"][0]
+                    end = word_chunk["timestamp"][1]
+                    if start is not None and end is not None:
+                        timestamp_text.append({
+                            "start": start + chunk_offset,
+                            "end": end + chunk_offset,
+                            "text": word_chunk["text"]
+                        })
+                all_transcriptions.append({
+                    "text": chunk_text,
+                    "timestamps": timestamp_text
+                })
+            else:
+                all_transcriptions.append({
+                    "text": result["text"],
+                    "timestamps": []
+                })
+        # Combine all transcriptions
+        full_text = " ".join([t["text"] for t in all_transcriptions])
+        output = f"**Transcription:**\n{full_text}\n"
+        if return_timestamps:
             output += "\n**Word-level Timestamps:**\n"
+            for trans in all_transcriptions:
+                for ts in trans["timestamps"]:
+                    output += f"[{ts['start']:.2f}s - {ts['end']:.2f}s] {ts['text']}\n"
+        if duration:
+            output += f"\n*Total duration: {duration:.1f}s | Processed in {total_chunks} chunk(s)*"
         return output
     except Exception as e:
         return f"Error during transcription: {str(e)}"
+    finally:
+        # Clean up temporary files
+        for temp_file in temp_files:
+            try:
+                if os.path.exists(temp_file):
+                    os.unlink(temp_file)
+            except:
+                pass
 # Language options for manual selection
 LANGUAGES = {
         - ✅ Preserves natural speech patterns
         - ✅ Word-level timestamps for precise alignment
         - ✅ Supports 99+ languages
+        - ✅ **Automatic chunking for long audio files** (processes in 5-minute segments)
         **Note:** This is optimized for verbatim transcription, capturing speech as naturally
         as possible including all disfluencies and non-lexical sounds.
                 label="Verbatim Transcription",
                 lines=20,
                 show_copy_button=True,
+                placeholder="Your verbatim transcription will appear here...\n\nLong audio files will be automatically processed in chunks."
             )
     gr.Markdown(
         - **Captures repetitions**: "I I I think that..."
         - **Includes non-words**: Attempts to phonetically transcribe sounds
         - **Lower thresholds**: Captures quieter speech and partial words
+        - **Handles long audio**: Automatically slices files longer than 5 minutes
         ### Use Cases
         - Legal transcription requiring exact wording
         - Medical/therapeutic session transcripts
         - Interview transcription with speaker mannerisms
         - Research requiring disfluency analysis
+        - Podcast and long-form content transcription
         ### Tips for Best Results
         - Use clear audio with minimal background noise
         - Ensure consistent audio levels
         - For very noisy environments, pre-process audio
         - Specify language manually if auto-detect misidentifies
+        - Long files are automatically chunked (no length limit!)
         """
     )
     # Set up event handler
+    def transcribe_wrapper(audio, task, timestamps, language_name, progress=gr.Progress()):
         language_code = LANGUAGES[language_name]
+        return transcribe_audio(audio, task, timestamps, language_code, progress)
     transcribe_btn.click(
         fn=transcribe_wrapper,