File size: 8,321 Bytes
ac96ee7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import spaces
import numpy as np

# Checkpoint and runtime placement: half precision on the first CUDA device
# when one is available, full precision on CPU otherwise.
MODEL_NAME = "openai/whisper-large-v3"
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Load the speech-to-text weights, then move them onto the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    use_safetensors=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
)
model.to(device)

# The processor bundles the tokenizer and feature extractor handed to the
# pipeline below.
processor = AutoProcessor.from_pretrained(MODEL_NAME)

# Shared ASR pipeline: audio is processed in 30-second chunks, 16 per batch.
# NOTE(review): Whisper's decoder context is 448 positions and is shared with
# the start/prompt tokens — confirm the installed transformers version accepts
# max_new_tokens=448 rather than rejecting it as too long.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device=device,
    torch_dtype=torch_dtype,
    batch_size=16,
    chunk_length_s=30,
    max_new_tokens=448,  # Increased for verbatim transcription
)

# Context text handed to Whisper's decoder to bias it towards keeping
# disfluencies in the transcript.
_VERBATIM_PROMPT = (
    "Transcribe everything verbatim, including um, uh, ah, filler words, "
    "hesitations, repetitions, false starts, and non-standard words."
)

# Decoder positions kept free for the special start tokens that precede the
# generated text (prompt tokens are accounted for separately).
_DECODER_START_MARGIN = 8


def _prepare_audio_input(audio):
    """Turn a Gradio audio value into an input the ASR pipeline accepts.

    File paths (and any other non-tuple value) are passed through untouched.
    A ``(sample_rate, samples)`` tuple is converted into the pipeline's
    ``{"raw": ..., "sampling_rate": ...}`` dict with mono float32 samples.
    """
    if not isinstance(audio, tuple):
        return audio

    sample_rate, samples = audio
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.signedinteger):
        # Integer PCM has to be scaled to roughly [-1.0, 1.0]; a bare astype()
        # would leave e.g. int16 samples at magnitudes up to 32768.
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # The pipeline only accepts single-channel audio.
        # Assumes the (samples, channels) layout of gr.Audio(type="numpy").
        samples = samples.mean(axis=1)

    return {"raw": samples, "sampling_rate": sample_rate}


@spaces.GPU
def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None):
    """
    Transcribe audio with the module-level Whisper pipeline, biased towards
    verbatim output (hesitations, fillers, repetitions, false starts).

    Args:
        audio: Audio input: a file path, or a ``(sample_rate, samples)`` tuple
            as produced by a Gradio numpy audio component.
        task: Either "transcribe" or "translate" (to English)
        return_timestamps: Whether to append word-level timestamps to the output
        language: Language code (None for auto-detect)

    Returns:
        A formatted string with the transcription and, when requested and
        available, one ``[start - end] word`` line per timestamped word.
        Failures are reported as an "Error during transcription: ..." string
        instead of being raised, so the UI can display them.
    """
    if audio is None:
        return "Please provide an audio file or recording."

    try:
        audio_input = _prepare_audio_input(audio)

        # Whisper takes its prompt as token ids (`prompt_ids`), not as text.
        prompt_ids = pipe.tokenizer.get_prompt_ids(
            _VERBATIM_PROMPT, return_tensors="pt"
        ).to(pipe.device)

        # Prompt, start tokens and generated tokens share the decoder's
        # position budget, so cap generation to what is left after the prompt.
        max_positions = pipe.model.config.max_target_positions
        token_budget = max(
            1, max_positions - prompt_ids.shape[-1] - _DECODER_START_MARGIN
        )

        generate_kwargs = {
            "task": task,
            # Verbatim transcription settings
            "condition_on_prev_tokens": True,  # Better context for non-words
            "compression_ratio_threshold": 1.35,  # Lower threshold to keep more content
            "logprob_threshold": -1.0,  # Keep lower probability tokens (hesitations, fillers)
            "no_speech_threshold": 0.3,  # Lower to capture quiet speech/sounds
            # NOTE(review): temperature fallback on <=30 s chunks needs a
            # transformers version that supports it — confirm.
            "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
            "prompt_ids": prompt_ids,
            "max_new_tokens": token_budget,
        }
        if language is not None:
            generate_kwargs["language"] = language

        # `return_timestamps` is a pipeline argument: it drives both generation
        # and the post-processing step that builds `result["chunks"]`.
        pipeline_kwargs = {"generate_kwargs": generate_kwargs}
        if return_timestamps:
            pipeline_kwargs["return_timestamps"] = "word"

        result = pipe(audio_input, **pipeline_kwargs)

        parts = [f"**Transcription:**\n{result['text']}\n"]

        if return_timestamps and "chunks" in result:
            parts.append("\n**Word-level Timestamps:**\n")
            for chunk in result["chunks"]:
                start, end = chunk["timestamp"]
                # Skip words whose timestamps are incomplete.
                if start is not None and end is not None:
                    parts.append(f"[{start:.2f}s - {end:.2f}s] {chunk['text']}\n")

        return "".join(parts)

    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing the app.
        return f"Error during transcription: {str(e)}"

# Display name -> language code offered in the language dropdown, in display
# order. "Auto-detect" maps to None, i.e. no language is forced.
LANGUAGES = dict(
    (
        ("Auto-detect", None),
        ("English", "en"),
        ("Spanish", "es"),
        ("French", "fr"),
        ("German", "de"),
        ("Italian", "it"),
        ("Portuguese", "pt"),
        ("Dutch", "nl"),
        ("Russian", "ru"),
        ("Chinese", "zh"),
        ("Japanese", "ja"),
        ("Korean", "ko"),
        ("Arabic", "ar"),
        ("Hindi", "hi"),
        ("Turkish", "tr"),
        ("Polish", "pl"),
        ("Ukrainian", "uk"),
        ("Vietnamese", "vi"),
        ("Thai", "th"),
        ("Indonesian", "id"),
        ("Czech", "cs"),
        ("Romanian", "ro"),
        ("Swedish", "sv"),
        ("Danish", "da"),
        ("Norwegian", "no"),
        ("Finnish", "fi"),
        ("Greek", "el"),
        ("Hebrew", "he"),
    )
)

# Build the Gradio interface: audio input and options on the left, the
# transcription output on the right, explanatory text above and below.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ Very Verbatim Multilingual Speech-to-Text
        
        Powered by OpenAI Whisper Large V3 with ZeroGPU acceleration.
        
        **Verbatim Transcription Features:**
        - ✅ Captures hesitations (um, uh, ah, eh)
        - ✅ Transcribes filler words and false starts
        - ✅ Includes repetitions and stutters
        - ✅ Attempts to transcribe non-standard words and sounds
        - ✅ Preserves natural speech patterns
        - ✅ Word-level timestamps for precise alignment
        - ✅ Supports 99+ languages
        
        **Note:** This is optimized for verbatim transcription, capturing speech as naturally 
        as possible including all disfluencies and non-lexical sounds.
        """
    )
    
    with gr.Row():
        with gr.Column():
            # type="filepath": the handler receives a path to the audio file.
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio Input"
            )
            
            with gr.Row():
                task_radio = gr.Radio(
                    choices=["transcribe", "translate"],
                    value="transcribe",
                    label="Task",
                    info="Transcribe in original language or translate to English"
                )
                
                language_dropdown = gr.Dropdown(
                    choices=list(LANGUAGES.keys()),
                    value="Auto-detect",
                    label="Language",
                    info="Select language or use auto-detect"
                )
            
            timestamps_checkbox = gr.Checkbox(
                label="Return word-level timestamps",
                value=False
            )
            
            transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")
        
        with gr.Column():
            output_text = gr.Textbox(
                label="Verbatim Transcription",
                lines=20,
                show_copy_button=True,
                placeholder="Your verbatim transcription will appear here..."
            )
    
    gr.Markdown(
        """
        ### What Makes This "Very Verbatim"?
        
        Unlike standard transcription that cleans up speech, this configuration:
        - **Keeps hesitations**: "um", "uh", "ah", "er", "mm"
        - **Preserves fillers**: "like", "you know", "I mean"
        - **Shows false starts**: "I was- I went to the store"
        - **Captures repetitions**: "I I I think that..."
        - **Includes non-words**: Attempts to phonetically transcribe sounds
        - **Lower thresholds**: Captures quieter speech and partial words
        
        ### Use Cases
        - Legal transcription requiring exact wording
        - Linguistic analysis of natural speech
        - Conversational AI training data
        - Medical/therapeutic session transcripts
        - Interview transcription with speaker mannerisms
        - Research requiring disfluency analysis
        
        ### Tips for Best Results
        - Use clear audio with minimal background noise
        - Ensure consistent audio levels
        - For very noisy environments, pre-process audio
        - Specify language manually if auto-detect misidentifies
        """
    )
    
    def transcribe_wrapper(audio, task, timestamps, language_name):
        """Map the dropdown's display name to a language code and transcribe.

        A cleared or unrecognised dropdown value falls back to None
        (auto-detect) instead of raising KeyError.
        """
        language_code = LANGUAGES.get(language_name)
        return transcribe_audio(audio, task, timestamps, language_code)
    
    # Run the transcription when the button is clicked and show the result
    # in the output textbox.
    transcribe_btn.click(
        fn=transcribe_wrapper,
        inputs=[audio_input, task_radio, timestamps_checkbox, language_dropdown],
        outputs=output_text
    )

# Launch the app only when executed as a script.
if __name__ == "__main__":
    demo.launch()