rafaaa2105 committed (verified) · Commit 9d3319a · Parent(s): 4a11f6a

Update app.py

Files changed (1): app.py +138 -36
app.py CHANGED
@@ -3,6 +3,10 @@ import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import spaces
 import numpy as np
+from pydub import AudioSegment
+import io
+import tempfile
+import os
 
 # Model configuration - Using Whisper with settings optimized for verbatim transcription
 MODEL_NAME = "openai/whisper-large-v3"
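For context, this hunk assumes `model`, `processor`, `torch_dtype`, and `device` are defined earlier in app.py, outside the diff. A minimal sketch of that setup using the standard transformers loading pattern; the exact flags in the real file are not confirmed:

```python
# Sketch of the setup the diff assumes; flags are typical, not taken from app.py.
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

MODEL_NAME = "openai/whisper-large-v3"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(MODEL_NAME)
```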
@@ -26,24 +30,75 @@ pipe = pipeline(
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
-    max_new_tokens=448,  # Increased for verbatim transcription
+    max_new_tokens=384,  # Reduced to account for prompt tokens
     chunk_length_s=30,
     batch_size=16,
     torch_dtype=torch_dtype,
     device=device,
 )
 
+def get_audio_duration(audio_path):
+    """Get duration of audio file in seconds."""
+    try:
+        audio = AudioSegment.from_file(audio_path)
+        return len(audio) / 1000.0  # Convert ms to seconds
+    except Exception:
+        return None
+
+def slice_audio(audio_path, chunk_duration=300):
+    """
+    Slice audio into chunks of the specified duration (in seconds).
+    Default is 5 minutes (300 seconds) per chunk.
+    """
+    audio = AudioSegment.from_file(audio_path)
+    duration_ms = len(audio)
+    chunk_duration_ms = chunk_duration * 1000
+
+    chunks = []
+    for i in range(0, duration_ms, chunk_duration_ms):
+        chunk = audio[i:i + chunk_duration_ms]
+
+        # Export the chunk to a temporary file
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+            chunk.export(temp_file.name, format="wav")
+            chunks.append(temp_file.name)
+
+    return chunks
+
 @spaces.GPU
-def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None):
+def transcribe_audio_chunk(audio_input, task="transcribe", language=None, return_timestamps=False):
+    """
+    Transcribe a single audio chunk with verbatim settings.
+    """
+    # Configure pipeline parameters for VERBATIM transcription
+    generate_kwargs = {
+        "task": task,
+        "language": language,
+        # Verbatim transcription settings
+        "condition_on_previous_text": True,
+        "compression_ratio_threshold": 1.35,
+        "logprob_threshold": -1.0,
+        "no_speech_threshold": 0.3,
+        "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
+        # Shorter prompt to avoid token limit issues
+        "prompt": "Transcribe verbatim including um, uh, hesitations.",
+    }
+
+    # Transcribe with verbatim settings; forward the word-timestamp request
+    result = pipe(audio_input, return_timestamps="word" if return_timestamps else False, generate_kwargs=generate_kwargs)
+    return result
+
+def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, progress=gr.Progress()):
     """
     Transcribe audio with very verbatim output using Whisper model with ZeroGPU.
-    Configured to capture hesitations, fillers, non-words, and all spoken sounds.
+    Automatically slices long audio files and processes them in batches.
 
     Args:
         audio: Audio input (file path or numpy array)
         task: Either "transcribe" or "translate" (to English)
         return_timestamps: Whether to return word-level timestamps
         language: Language code (None for auto-detect)
+        progress: Gradio progress tracker
 
     Returns:
         Verbatim transcription text and optional timestamp information
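The slicing in `slice_audio` is plain millisecond arithmetic: pydub's `audio[i:i + n]` clamps at the end of the file, so the final chunk is simply shorter. A self-contained check of the boundaries it produces, no pydub required (the 12-minute duration is just an example):

```python
def chunk_bounds(duration_ms: int, chunk_duration_ms: int = 300_000):
    """Mirror slice_audio's loop: (start, end) of each chunk in ms."""
    return [(i, min(i + chunk_duration_ms, duration_ms))
            for i in range(0, duration_ms, chunk_duration_ms)]

print(chunk_bounds(720_000))  # hypothetical 12-minute file
# [(0, 300000), (300000, 600000), (600000, 720000)]  -> 5 min, 5 min, 2 min
```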
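Calling the helper directly shows the output shape the rest of the commit relies on: the transformers ASR pipeline returns a dict with `"text"` and, when word timestamps are requested, a `"chunks"` list of `{"text", "timestamp": (start_s, end_s)}` entries. A hypothetical usage sketch (`sample.wav` is a placeholder):

```python
result = transcribe_audio_chunk("sample.wav", task="transcribe",
                                language="en", return_timestamps=True)
print(result["text"])
for word in result.get("chunks", []):
    start, end = word["timestamp"]  # either bound may be None
    if start is not None and end is not None:
        print(f"[{start:.2f}s - {end:.2f}s]{word['text']}")
```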
@@ -51,54 +106,97 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None):
     if audio is None:
         return "Please provide an audio file or recording."
 
+    temp_files = []
+
     try:
         # Handle different audio input formats
         if isinstance(audio, str):
-            audio_input = audio
+            audio_path = audio
         elif isinstance(audio, tuple):
             # Gradio microphone input format: (sample_rate, audio_data)
             sr, audio_data = audio
-            audio_input = {"array": audio_data.astype(np.float32), "sampling_rate": sr}
+            # Save to a temporary file for processing
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+                import scipy.io.wavfile
+                scipy.io.wavfile.write(temp_file.name, sr, audio_data)
+                audio_path = temp_file.name
+                temp_files.append(audio_path)
         else:
-            audio_input = audio
-
-        # Configure pipeline parameters for VERBATIM transcription
-        generate_kwargs = {
-            "task": task,
-            "language": language,
-            # Verbatim transcription settings
-            "condition_on_previous_text": True,  # Better context for non-words
-            "compression_ratio_threshold": 1.35,  # Lower threshold to keep more content
-            "logprob_threshold": -1.0,  # Keep lower probability tokens (hesitations, fillers)
-            "no_speech_threshold": 0.3,  # Lower to capture quiet speech/sounds
-            "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),  # Temperature fallback for better coverage
-            "prompt": "Transcribe everything verbatim, including um, uh, ah, filler words, hesitations, repetitions, false starts, and non-standard words.",
-        }
-
-        if return_timestamps:
-            generate_kwargs["return_timestamps"] = "word"
-
-        # Transcribe with verbatim settings
-        result = pipe(audio_input, generate_kwargs=generate_kwargs)
-
-        # Format output
-        text = result["text"]
-
-        # Additional info
-        output = f"**Transcription:**\n{text}\n"
+            return "Unsupported audio format."
+
+        # Check audio duration and slice if necessary
+        duration = get_audio_duration(audio_path)
+        chunk_duration = 300  # 5 minutes per chunk
+
+        if duration and duration > chunk_duration:
+            progress(0, desc=f"Audio is {duration:.1f}s long. Slicing into chunks...")
+            audio_chunks = slice_audio(audio_path, chunk_duration)
+            temp_files.extend(audio_chunks)
+        else:
+            audio_chunks = [audio_path]
+
+        # Process each chunk
+        all_transcriptions = []
+        total_chunks = len(audio_chunks)
+
+        for idx, chunk_path in enumerate(audio_chunks):
+            progress((idx + 1) / total_chunks, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
+
+            result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps)
+
+            if return_timestamps and "chunks" in result:
+                # Shift chunk-relative timestamps by the chunk's start offset
+                chunk_offset = idx * chunk_duration
+                chunk_text = result["text"]
+                timestamp_text = []
+
+                for word_chunk in result["chunks"]:
+                    start = word_chunk["timestamp"][0]
+                    end = word_chunk["timestamp"][1]
+                    if start is not None and end is not None:
+                        timestamp_text.append({
+                            "start": start + chunk_offset,
+                            "end": end + chunk_offset,
+                            "text": word_chunk["text"]
+                        })
+
+                all_transcriptions.append({
+                    "text": chunk_text,
+                    "timestamps": timestamp_text
+                })
+            else:
+                all_transcriptions.append({
+                    "text": result["text"],
+                    "timestamps": []
+                })
+
+        # Combine all transcriptions
+        full_text = " ".join([t["text"] for t in all_transcriptions])
+
+        output = f"**Transcription:**\n{full_text}\n"
 
-        if return_timestamps and "chunks" in result:
+        if return_timestamps:
             output += "\n**Word-level Timestamps:**\n"
-            for chunk in result["chunks"]:
-                start = chunk["timestamp"][0]
-                end = chunk["timestamp"][1]
-                if start is not None and end is not None:
-                    output += f"[{start:.2f}s - {end:.2f}s] {chunk['text']}\n"
+            for trans in all_transcriptions:
+                for ts in trans["timestamps"]:
+                    output += f"[{ts['start']:.2f}s - {ts['end']:.2f}s] {ts['text']}\n"
+
+        if duration:
+            output += f"\n*Total duration: {duration:.1f}s | Processed in {total_chunks} chunk(s)*"
 
         return output
 
     except Exception as e:
         return f"Error during transcription: {str(e)}"
+
+    finally:
+        # Clean up temporary files
+        for temp_file in temp_files:
+            try:
+                if os.path.exists(temp_file):
+                    os.unlink(temp_file)
+            except Exception:
+                pass
 
 # Language options for manual selection
 LANGUAGES = {
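The offset bookkeeping is the one subtle part of the chunked path: Whisper timestamps are relative to each chunk, so a word at [1.50s, 2.00s] inside the third chunk actually sits 2 × 300 s later in the original recording. A tiny worked example of the same arithmetic:

```python
chunk_duration = 300                  # seconds per chunk, as in transcribe_audio
idx = 2                               # third chunk
chunk_offset = idx * chunk_duration   # 600 s into the original audio

word = {"timestamp": (1.5, 2.0), "text": " um"}
start, end = word["timestamp"]
print(f"[{start + chunk_offset:.2f}s - {end + chunk_offset:.2f}s]{word['text']}")
# [601.50s - 602.00s] um
```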
@@ -148,6 +246,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     - ✅ Preserves natural speech patterns
     - ✅ Word-level timestamps for precise alignment
     - ✅ Supports 99+ languages
+    - ✅ **Automatic chunking for long audio files** (processes in 5-minute segments)
 
     **Note:** This is optimized for verbatim transcription, capturing speech as naturally
     as possible, including all disfluencies and non-lexical sounds.
@@ -189,7 +288,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             label="Verbatim Transcription",
             lines=20,
             show_copy_button=True,
-            placeholder="Your verbatim transcription will appear here..."
+            placeholder="Your verbatim transcription will appear here...\n\nLong audio files will be automatically processed in chunks."
         )
 
         gr.Markdown(
@@ -203,6 +302,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     - **Captures repetitions**: "I I I think that..."
     - **Includes non-words**: Attempts to phonetically transcribe sounds
     - **Lower thresholds**: Captures quieter speech and partial words
+    - **Handles long audio**: Automatically slices files longer than 5 minutes
 
     ### Use Cases
     - Legal transcription requiring exact wording
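The "lower thresholds" bullet maps to Whisper's decode-with-fallback rule: each chunk is first decoded greedily (temperature 0.0), and the result is only re-decoded at the next temperature if it looks degenerate (compression ratio too high) or too improbable (average log-probability too low). A conceptual sketch of that rule; `decode_once` is a hypothetical stand-in for a single decoding pass, not a real transformers API:

```python
def decode_with_fallback(decode_once,
                         temperatures=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
                         compression_ratio_threshold=1.35,
                         logprob_threshold=-1.0):
    """Accept the first pass whose quality metrics clear both thresholds."""
    segment = None
    for t in temperatures:
        segment = decode_once(temperature=t)
        if (segment["compression_ratio"] <= compression_ratio_threshold
                and segment["avg_logprob"] >= logprob_threshold):
            return segment
    return segment  # every temperature failed; keep the last attempt
```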
@@ -211,19 +311,21 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     - Medical/therapeutic session transcripts
     - Interview transcription with speaker mannerisms
     - Research requiring disfluency analysis
+    - Podcast and long-form content transcription
 
     ### Tips for Best Results
     - Use clear audio with minimal background noise
     - Ensure consistent audio levels
     - For very noisy environments, pre-process audio
     - Specify language manually if auto-detect misidentifies
+    - Long files are automatically chunked (no length limit!)
     """
     )
 
     # Set up event handler
-    def transcribe_wrapper(audio, task, timestamps, language_name):
+    def transcribe_wrapper(audio, task, timestamps, language_name, progress=gr.Progress()):
         language_code = LANGUAGES[language_name]
-        return transcribe_audio(audio, task, timestamps, language_code)
+        return transcribe_audio(audio, task, timestamps, language_code, progress)
 
     transcribe_btn.click(
         fn=transcribe_wrapper,
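The diff cuts off inside `transcribe_btn.click(`. For orientation, a typical completion of that wiring is sketched below; every component name here (`audio_input`, `task_selector`, `timestamps_checkbox`, `language_dropdown`, `output_text`) is an assumption, not taken from the file:

```python
transcribe_btn.click(
    fn=transcribe_wrapper,
    inputs=[audio_input, task_selector, timestamps_checkbox, language_dropdown],
    outputs=output_text,
)
```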
 