rafaaa2105 committed on
Commit f9600ca · verified · 1 Parent(s): f0d997e

Update app.py

Files changed (1):
  1. app.py +82 -67
app.py CHANGED
@@ -4,15 +4,17 @@ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import spaces
 import numpy as np
 from pydub import AudioSegment
-import io
 import tempfile
 import os
 
-# Model configuration - Using Whisper with settings optimized for verbatim transcription
-MODEL_NAME = "openai/whisper-large-v3"
+# Model configuration - Using CrisperWhisper for TRUE verbatim transcription
+# CrisperWhisper is designed to transcribe EVERY word including um, uh, fillers, stutters, false starts
+MODEL_NAME = "nyrahealth/CrisperWhisper"
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
+print(f"Loading {MODEL_NAME} for verbatim transcription...")
+
 # Load model and processor
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     MODEL_NAME,
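Note on the checkpoint swap: loading any remote checkpoint can fail at startup (network issues, gated or renamed repos), and this commit removes the stock Whisper fallback entirely. A minimal defensive sketch, not part of this commit; `load_model` is a hypothetical helper:

```python
import torch
from transformers import AutoModelForSpeechSeq2Seq

FALLBACK_MODEL = "openai/whisper-large-v3"  # the checkpoint this commit replaces

def load_model(name: str, dtype: torch.dtype):
    """Load the requested checkpoint, falling back to stock Whisper on failure."""
    try:
        return name, AutoModelForSpeechSeq2Seq.from_pretrained(name, torch_dtype=dtype)
    except OSError:
        # transformers raises OSError when a repo is missing, gated, or unreachable
        return FALLBACK_MODEL, AutoModelForSpeechSeq2Seq.from_pretrained(
            FALLBACK_MODEL, torch_dtype=dtype
        )
```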
@@ -24,25 +26,27 @@ model.to(device)
 
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
 
-# Create pipeline with verbatim-optimized settings
+# Create pipeline optimized for verbatim output
 pipe = pipeline(
     "automatic-speech-recognition",
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
-    max_new_tokens=384,
+    max_new_tokens=448,
     chunk_length_s=30,
     batch_size=16,
-    return_timestamps=True,
+    return_timestamps="word",  # CrisperWhisper provides accurate word-level timestamps
     torch_dtype=torch_dtype,
     device=device,
 )
 
+print("Model loaded successfully!")
+
 def get_audio_duration(audio_path):
     """Get duration of audio file in seconds."""
     try:
         audio = AudioSegment.from_file(audio_path)
-        return len(audio) / 1000.0
+        return len(audio) / 1000.0
     except:
         return None
 
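For context, with `return_timestamps="word"` baked into the pipeline, every call now returns word-level chunks alongside the full text. A sketch of the result shape per the transformers ASR pipeline API; the input path is illustrative:

```python
# Assuming `pipe` as constructed above; "sample.wav" is an illustrative path.
result = pipe("sample.wav")

print(result["text"])  # the full verbatim transcription
for word in result["chunks"]:
    start, end = word["timestamp"]  # (start_seconds, end_seconds)
    print(f"{start:7.2f}s - {end:7.2f}s  {word['text']}")
```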
@@ -59,7 +63,6 @@ def slice_audio(audio_path, chunk_duration=300):
     for i in range(0, duration_ms, chunk_duration_ms):
         chunk = audio[i:i + chunk_duration_ms]
 
-        # Export chunk to temporary file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
             chunk.export(temp_file.name, format="wav")
             chunks.append(temp_file.name)
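Worth noting for reviewers: `pydub` indexes `AudioSegment` objects in milliseconds, which is why `slice_audio` converts `chunk_duration` to `chunk_duration_ms`. The same idea in isolation (the file name is illustrative):

```python
from pydub import AudioSegment

audio = AudioSegment.from_file("long_recording.mp3")  # illustrative file
chunk_ms = 300 * 1000  # slice_audio's 5-minute default, in milliseconds

# AudioSegment slicing and len() both operate in milliseconds,
# so the last piece is simply shorter when the audio doesn't divide evenly.
pieces = [audio[i:i + chunk_ms] for i in range(0, len(audio), chunk_ms)]
```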
@@ -67,36 +70,31 @@ def slice_audio(audio_path, chunk_duration=300):
     return chunks
 
 @spaces.GPU
-def transcribe_audio_chunk(audio_input, task="transcribe", language=None, return_timestamps=False):
+def transcribe_audio_chunk(audio_input, task="transcribe", language=None):
     """
-    Transcribe a single audio chunk with verbatim settings.
+    Transcribe a single audio chunk with CrisperWhisper.
+    This model is specifically trained for verbatim transcription.
     """
-    # Configure pipeline parameters for VERBATIM transcription
     generate_kwargs = {
         "task": task,
-        "language": language,
-        # Verbatim transcription settings - only use supported parameters
-        "return_timestamps": "word" if return_timestamps else False,
     }
 
-    # Transcribe with verbatim settings
+    if language:
+        generate_kwargs["language"] = language
+
+    # CrisperWhisper automatically provides verbatim transcription
     result = pipe(audio_input, generate_kwargs=generate_kwargs)
     return result
 
 def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, progress=gr.Progress()):
     """
-    Transcribe audio with very verbatim output using Whisper model with ZeroGPU.
-    Automatically slices long audio files and processes in batches.
-
-    Args:
-        audio: Audio input (file path or numpy array)
-        task: Either "transcribe" or "translate" (to English)
-        return_timestamps: Whether to return word-level timestamps
-        language: Language code (None for auto-detect)
-        progress: Gradio progress tracker
-
-    Returns:
-        Verbatim transcription text and optional timestamp information
+    Transcribe audio with VERY VERBATIM output using CrisperWhisper.
+    CrisperWhisper transcribes every spoken word exactly as it is, including:
+    - Fillers (um, uh, ah, er, mm)
+    - Pauses and hesitations
+    - Stutters and repetitions
+    - False starts
+    - Non-standard utterances
     """
     if audio is None:
         return "Please provide an audio file or recording."
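The rewritten helper only adds `language` to `generate_kwargs` when the caller sets it; with the key absent, Whisper-family models fall back to automatic language detection. A usage sketch (the chunk file name is illustrative):

```python
# Auto-detected language (no "language" key is added to generate_kwargs):
result = transcribe_audio_chunk("chunk_000.wav")

# Forced source language, translated to English:
result = transcribe_audio_chunk("chunk_000.wav", task="translate", language="de")
print(result["text"])
```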
@@ -108,9 +106,7 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
     if isinstance(audio, str):
         audio_path = audio
     elif isinstance(audio, tuple):
-        # Gradio microphone input format: (sample_rate, audio_data)
         sr, audio_data = audio
-        # Save to temporary file for processing
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
             import scipy.io.wavfile
             scipy.io.wavfile.write(temp_file.name, sr, audio_data)
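For context, Gradio's numeric microphone format is a `(sample_rate, numpy_array)` tuple, which `scipy.io.wavfile.write` serializes directly; the temp file then flows through the same path as an uploaded file. A self-contained sketch of that conversion, using a synthetic tone in place of real microphone data:

```python
import tempfile

import numpy as np
import scipy.io.wavfile

sr = 16_000  # sample rate, as Gradio would report it
tone = (np.sin(2 * np.pi * 440 * np.arange(sr) / sr) * 32767).astype(np.int16)  # 1 s stand-in

with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
    scipy.io.wavfile.write(temp_file.name, sr, tone)
    audio_path = temp_file.name  # from here on, identical to an uploaded file
```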
@@ -137,10 +133,9 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
     for idx, chunk_path in enumerate(audio_chunks):
         progress((idx + 1) / total_chunks, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")
 
-        result = transcribe_audio_chunk(chunk_path, task, language, return_timestamps)
+        result = transcribe_audio_chunk(chunk_path, task, language)
 
         if return_timestamps and "chunks" in result:
-            # Add chunk offset to timestamps
             chunk_offset = idx * chunk_duration
             chunk_text = result["text"]
             timestamp_text = []
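Since each 5-minute chunk is transcribed independently, its word timestamps restart at zero; the loop therefore shifts them by `idx * chunk_duration` before merging. A minimal sketch of that adjustment, assuming the `chunks` result format shown earlier (`offset_words` is a hypothetical helper, not code from this commit):

```python
def offset_words(result, chunk_offset):
    """Shift one chunk's word timestamps into the full recording's timeline."""
    shifted = []
    for word in result.get("chunks", []):
        start, end = word["timestamp"]
        # The final word's end timestamp can be None in some transformers versions.
        shifted.append({
            "text": word["text"],
            "timestamp": (start + chunk_offset,
                          end + chunk_offset if end is not None else None),
        })
    return shifted
```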
@@ -168,7 +163,7 @@ def transcribe_audio(audio, task="transcribe", return_timestamps=False, language
     # Combine all transcriptions
     full_text = " ".join([t["text"] for t in all_transcriptions])
 
-    output = f"**Transcription:**\n{full_text}\n"
+    output = f"**Verbatim Transcription:**\n{full_text}\n"
 
     if return_timestamps:
         output += "\n**Word-level Timestamps:**\n"
@@ -231,18 +226,23 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         """
         # 🎙️ Very Verbatim Multilingual Speech-to-Text
 
-        Powered by OpenAI Whisper Large V3 with ZeroGPU acceleration.
+        Powered by **CrisperWhisper** - specifically designed for verbatim transcription with ZeroGPU acceleration.
+
+        ## 🔥 TRUE Verbatim Transcription
+
+        Unlike standard Whisper (which omits disfluencies), **CrisperWhisper captures EVERYTHING**:
 
-        **Verbatim Transcription Features:**
-        - ✅ Captures hesitations (um, uh, ah, eh)
-        - ✅ Transcribes filler words and false starts
-        - ✅ Includes repetitions and stutters
-        - ✅ Preserves natural speech patterns with word-level timestamps
-        - ✅ Supports 99+ languages
-        - ✅ **Automatic chunking for long audio files** (processes in 5-minute segments)
+        - ✅ **Fillers**: um, uh, ah, er, mm, like, you know
+        - ✅ **Hesitations**: pauses, breath sounds, stutters
+        - ✅ **False Starts**: "I was- I went to the store"
+        - ✅ **Repetitions**: "I I I think that..."
+        - ✅ **Disfluencies**: Every non-fluent speech element
+        - ✅ **Accurate Word-Level Timestamps**: Precise timing even around disfluencies
+        - ✅ **Multilingual**: Supports 99+ languages
+        - ✅ **Long Audio Support**: Automatic 5-minute chunking
 
-        **Note:** Whisper Large V3 naturally captures disfluencies when using word-level timestamps.
-        The model transcribes speech as naturally as possible including hesitations and non-lexical sounds.
+        **Perfect for:** Legal transcription, linguistic research, therapy sessions, interviews,
+        conversational AI training, or any use case requiring exact speech capture.
         """
     )
 
@@ -259,7 +259,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 choices=["transcribe", "translate"],
                 value="transcribe",
                 label="Task",
-                info="Transcribe in original language or translate to English"
+                info="Transcribe verbatim or translate to English"
             )
 
             language_dropdown = gr.Dropdown(
@@ -270,47 +270,62 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             )
 
             timestamps_checkbox = gr.Checkbox(
-                label="Return word-level timestamps",
-                value=False
+                label="Show word-level timestamps",
+                value=True,
+                info="Display precise timing for each word"
             )
 
             transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")
 
         with gr.Column():
             output_text = gr.Textbox(
-                label="Verbatim Transcription",
+                label="Verbatim Transcription (includes all um, uh, hesitations)",
                 lines=20,
                 show_copy_button=True,
-                placeholder="Your verbatim transcription will appear here...\n\nLong audio files will be automatically processed in chunks."
+                placeholder="Your VERY verbatim transcription will appear here...\n\nEvery um, uh, stutter, and hesitation will be captured!"
             )
 
     gr.Markdown(
         """
-        ### What Makes This "Very Verbatim"?
+        ### Why CrisperWhisper for Verbatim?
 
-        Whisper Large V3 with word-level timestamps naturally provides verbatim transcription:
-        - **Keeps hesitations**: "um", "uh", "ah", "er", "mm"
-        - **Preserves fillers**: "like", "you know", "I mean"
-        - **Shows false starts**: "I was- I went to the store"
-        - **Captures repetitions**: "I I I think that..."
-        - **Word-level precision**: Exact timestamps for every word
-        - **Handles long audio**: Automatically slices files longer than 5 minutes
+        **Standard Whisper** is trained to transcribe the "intended meaning" - it automatically cleans up:
+        - ❌ Removes "um", "uh", "ah"
+        - ❌ Omits false starts
+        - ❌ Skips repetitions
+        - ❌ Ignores stutters
+
+        **CrisperWhisper** is specifically trained for verbatim transcription:
+        - ✅ Keeps every filler word
+        - ✅ Preserves all disfluencies
+        - ✅ Captures exact speech patterns
+        - ✅ Accurate timestamps around hesitations
+
+        ### Example Comparison
+
+        **Input Audio:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"
+
+        **Standard Whisper:** "So I was thinking that we could go to the store"
+
+        **CrisperWhisper:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"
 
         ### Use Cases
-        - Legal transcription requiring exact wording
-        - Linguistic analysis of natural speech
-        - Conversational AI training data
-        - Medical/therapeutic session transcripts
-        - Interview transcription with speaker mannerisms
-        - Research requiring disfluency analysis
-        - Podcast and long-form content transcription
+
+        - **Legal/Court Transcription**: Exact wording required by law
+        - **Linguistic Research**: Study of natural speech patterns and disfluencies
+        - **Medical/Therapy Sessions**: Capturing patient speech patterns
+        - **Interview Transcription**: Preserving speaker mannerisms
+        - **Conversational AI Training**: Realistic dialogue data
+        - **Accessibility**: Providing complete transcripts for deaf/hard-of-hearing
+        - **Language Learning**: Analyzing natural spoken language
 
         ### Tips for Best Results
-        - Use clear audio with minimal background noise
-        - Ensure consistent audio levels
-        - For very noisy environments, pre-process audio
-        - Specify language manually if auto-detect misidentifies
-        - Long files are automatically chunked (no length limit!)
+
+        - Clear audio with minimal background noise works best
+        - The model captures quiet speech - ensure consistent audio levels
+        - Manual language selection can improve accuracy
+        - Long files are automatically processed in 5-minute chunks
+        - Timestamps help identify exact moments of hesitations
         """
     )
 