# Hugging Face Space: rafaaa2105/Crisper-Whisper
# Space status when captured: Runtime error
# File size: 8,321 bytes (revision ac96ee7)

import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import spaces
import numpy as np

# Model configuration - Whisper large-v3; verbatim-oriented decoding options are
# supplied per request by the transcription function.
MODEL_NAME = "openai/whisper-large-v3"
# Use the first CUDA device with half precision when available, otherwise
# fall back to CPU with full precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_NAME)

# Per-chunk generation budget.
# The previous hard-coded value (448) equals Whisper's decoder context size
# (`max_target_positions`), which leaves no room for the forced
# start-of-transcript / language / task tokens or for any prompt tokens; recent
# transformers releases reject that combination with a ValueError. Half of the
# context keeps ample room for a 30 s chunk while leaving space for a prompt.
# NOTE(review): confirm against the transformers version pinned for this Space.
MAX_NEW_TOKENS = model.config.max_target_positions // 2

# Create the chunked ASR pipeline. Audio is split into 30 s windows that are
# decoded in batches of 16.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
    # `max_new_tokens` as a direct pipeline argument is deprecated; generation
    # options belong in `generate_kwargs`.
    generate_kwargs={"max_new_tokens": MAX_NEW_TOKENS},
)

# Text used to bias Whisper's decoder towards keeping disfluencies.
# NOTE(review): Whisper consumes a prompt as preceding transcript context, not
# as an instruction; a prompt that itself contains fillers may work better.
_VERBATIM_PROMPT = (
    "Transcribe everything verbatim, including um, uh, ah, filler words, "
    "hesitations, repetitions, false starts, and non-standard words."
)


def _prepare_audio_input(audio):
    """Convert a Gradio audio value into an input the ASR pipeline accepts.

    File paths (and any other non-tuple value) are passed through untouched.
    A ``(sample_rate, samples)`` tuple is converted to the pipeline's dict
    format with mono float32 samples.
    """
    if not isinstance(audio, tuple):
        return audio

    sample_rate, samples = audio
    samples = np.asarray(samples)

    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM (e.g. int16 from a microphone) must be scaled into
        # roughly [-1, 1]; a bare cast would leave values in the +/-32768 range.
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # The pipeline only accepts single-channel audio; average the channels.
        # assumes a (samples, channels) layout as produced by Gradio - TODO confirm
        samples = samples.mean(axis=1)

    return {"array": samples, "sampling_rate": sample_rate}


def _format_word_timestamps(chunks):
    """Return one ``[start - end] word`` line per chunk with complete timing."""
    lines = []
    for chunk in chunks:
        start, end = chunk["timestamp"][0], chunk["timestamp"][1]
        # Skip words whose start or end could not be determined.
        if start is not None and end is not None:
            lines.append(f"[{start:.2f}s - {end:.2f}s] {chunk['text']}\n")
    return lines


@spaces.GPU
def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None):
    """
    Transcribe audio with the module-level Whisper pipeline, biased to be verbatim.

    Args:
        audio: Audio input - a file path, or a ``(sample_rate, samples)`` tuple
            holding a numpy array.
        task: Either "transcribe" or "translate" (to English).
        return_timestamps: Whether to append word-level timestamps to the output.
        language: Language code, or None to let the model auto-detect.

    Returns:
        A Markdown-formatted string with the transcription and, when requested
        and available, one line per word with its start/end time. Failures are
        reported as an "Error during transcription: ..." string instead of
        being raised.
    """
    if audio is None:
        return "Please provide an audio file or recording."

    try:
        audio_input = _prepare_audio_input(audio)

        # transformers' Whisper `generate()` has no text `prompt` argument; the
        # prompt has to be tokenised and supplied as `prompt_ids`.
        prompt_ids = pipe.tokenizer.get_prompt_ids(
            _VERBATIM_PROMPT, return_tensors="pt"
        ).to(pipe.device)

        generate_kwargs = {
            "task": task,
            "language": language,
            # transformers' name for openai-whisper's `condition_on_previous_text`.
            # Chunks produced by the chunked pipeline are decoded independently,
            # so this has limited effect here.
            "condition_on_prev_tokens": True,
            # A segment whose text compresses better than this ratio (i.e. is
            # highly repetitive) is re-decoded at the next temperature.
            "compression_ratio_threshold": 1.35,
            # A segment whose average log-probability falls below this is
            # re-decoded at the next temperature.
            "logprob_threshold": -1.0,
            # A segment is treated as silence when its no-speech probability
            # exceeds this AND it also fails `logprob_threshold`.
            # NOTE(review): 0.3 drops silence more readily than the usual 0.6.
            "no_speech_threshold": 0.3,
            # Temperatures tried in order when a segment fails the checks above.
            "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
            "prompt_ids": prompt_ids,
            # Prompt tokens, forced start tokens and generated tokens all share
            # the decoder context, so keep the generation budget to half of it.
            "max_new_tokens": pipe.model.config.max_target_positions // 2,
        }

        # `return_timestamps` is an argument of the pipeline call itself; only
        # then does the pipeline emit the per-word "chunks" list.
        pipe_kwargs = {"generate_kwargs": generate_kwargs}
        if return_timestamps:
            pipe_kwargs["return_timestamps"] = "word"

        result = pipe(audio_input, **pipe_kwargs)

        # Format output
        text = result["text"]
        parts = [f"**Transcription:**\n{text}\n"]

        if return_timestamps and "chunks" in result:
            parts.append("\n**Word-level Timestamps:**\n")
            parts.extend(_format_word_timestamps(result["chunks"]))

        return "".join(parts)

    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing the app.
        return f"Error during transcription: {str(e)}"

# Languages offered in the UI, in display order, as (label, language code)
# pairs. The "Auto-detect" entry maps to None, i.e. no language is forced.
_LANGUAGE_CHOICES = (
    ("Auto-detect", None),
    ("English", "en"),
    ("Spanish", "es"),
    ("French", "fr"),
    ("German", "de"),
    ("Italian", "it"),
    ("Portuguese", "pt"),
    ("Dutch", "nl"),
    ("Russian", "ru"),
    ("Chinese", "zh"),
    ("Japanese", "ja"),
    ("Korean", "ko"),
    ("Arabic", "ar"),
    ("Hindi", "hi"),
    ("Turkish", "tr"),
    ("Polish", "pl"),
    ("Ukrainian", "uk"),
    ("Vietnamese", "vi"),
    ("Thai", "th"),
    ("Indonesian", "id"),
    ("Czech", "cs"),
    ("Romanian", "ro"),
    ("Swedish", "sv"),
    ("Danish", "da"),
    ("Norwegian", "no"),
    ("Finnish", "fi"),
    ("Greek", "el"),
    ("Hebrew", "he"),
)

# Label -> language code lookup; insertion order follows the pairs above.
LANGUAGES = dict(_LANGUAGE_CHOICES)

# Create Gradio interface
# Layout: an introductory Markdown panel, then a two-column row (inputs on the
# left, transcription output on the right), then a Markdown panel with usage
# notes. The click handler is wired up at the end of the block.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Introductory panel describing the app.
    gr.Markdown(
        """
        # 🎙️ Very Verbatim Multilingual Speech-to-Text
        
        Powered by OpenAI Whisper Large V3 with ZeroGPU acceleration.
        
        **Verbatim Transcription Features:**
        - ✅ Captures hesitations (um, uh, ah, eh)
        - ✅ Transcribes filler words and false starts
        - ✅ Includes repetitions and stutters
        - ✅ Attempts to transcribe non-standard words and sounds
        - ✅ Preserves natural speech patterns
        - ✅ Word-level timestamps for precise alignment
        - ✅ Supports 99+ languages
        
        **Note:** This is optimized for verbatim transcription, capturing speech as naturally 
        as possible including all disfluencies and non-lexical sounds.
        """
    )
    
    with gr.Row():
        # Left column: audio source and transcription options.
        with gr.Column():
            # Accepts an uploaded file or a microphone recording; configured
            # with type="filepath".
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio Input"
            )
            
            with gr.Row():
                task_radio = gr.Radio(
                    choices=["transcribe", "translate"],
                    value="transcribe",
                    label="Task",
                    info="Transcribe in original language or translate to English"
                )
                
                # Choices are the display names (keys) of LANGUAGES; the
                # handler below maps the selected name back to its code.
                language_dropdown = gr.Dropdown(
                    choices=list(LANGUAGES.keys()),
                    value="Auto-detect",
                    label="Language",
                    info="Select language or use auto-detect"
                )
            
            timestamps_checkbox = gr.Checkbox(
                label="Return word-level timestamps",
                value=False
            )
            
            transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")
        
        # Right column: transcription result.
        with gr.Column():
            output_text = gr.Textbox(
                label="Verbatim Transcription",
                lines=20,
                show_copy_button=True,
                placeholder="Your verbatim transcription will appear here..."
            )
    
    # Explanatory panel shown below the input/output row.
    gr.Markdown(
        """
        ### What Makes This "Very Verbatim"?
        
        Unlike standard transcription that cleans up speech, this configuration:
        - **Keeps hesitations**: "um", "uh", "ah", "er", "mm"
        - **Preserves fillers**: "like", "you know", "I mean"
        - **Shows false starts**: "I was- I went to the store"
        - **Captures repetitions**: "I I I think that..."
        - **Includes non-words**: Attempts to phonetically transcribe sounds
        - **Lower thresholds**: Captures quieter speech and partial words
        
        ### Use Cases
        - Legal transcription requiring exact wording
        - Linguistic analysis of natural speech
        - Conversational AI training data
        - Medical/therapeutic session transcripts
        - Interview transcription with speaker mannerisms
        - Research requiring disfluency analysis
        
        ### Tips for Best Results
        - Use clear audio with minimal background noise
        - Ensure consistent audio levels
        - For very noisy environments, pre-process audio
        - Specify language manually if auto-detect misidentifies
        """
    )
    
    # Set up event handler
    def transcribe_wrapper(audio, task, timestamps, language_name):
        """Translate the selected language name to its code and run the transcription.

        Arguments arrive in the order of the `inputs` list passed to
        `transcribe_btn.click` below. `language_name` must be a key of
        LANGUAGES; any other value raises KeyError.
        """
        language_code = LANGUAGES[language_name]
        return transcribe_audio(audio, task, timestamps, language_code)
    
    # Run the wrapper when the button is clicked and show its return value in
    # the output textbox.
    transcribe_btn.click(
        fn=transcribe_wrapper,
        inputs=[audio_input, task_radio, timestamps_checkbox, language_dropdown],
        outputs=output_text
    )

# Launch the app
# Only start the Gradio server when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    demo.launch()