Spaces:

nambn0321
/

T5_First_US_Accent

Sleeping

App Files Files Community

nambn0321 committed on Sep 17

Commit

3146cac

verified ·

1 Parent(s): 8131e32

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -18

app.py CHANGED Viewed

@@ -1,11 +1,9 @@
 import torch
 import gradio as gr
 import torchaudio
-# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
 from transformers.models.speecht5 import SpeechT5HifiGan
 # Load model and processor
 processor = SpeechT5Processor.from_pretrained("nambn0321/T5_british")
 model = SpeechT5ForTextToSpeech.from_pretrained(
@@ -20,35 +18,37 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 model = model.to(device)
 vocoder = vocoder.to(device)
 def tts_generate(text):
-    print(f" Input text: {text}")
     try:
         # Preprocess input
-        print(" Processing input...")
         inputs = processor(text=text, return_tensors="pt").to(device)
-        print(" Text processed.")
-        # Generate waveform directly (with vocoder)
-        print("🎤 Generating speech waveform...")
         with torch.no_grad():
-            waveform = model.generate_speech(
-                inputs["input_ids"],
-                vocoder=vocoder
-            )
-        print(" Waveform generated.")
         # Save waveform
         output_path = "output.wav"
         if waveform.dim() == 1:
             waveform = waveform.unsqueeze(0)
         torchaudio.save(output_path, waveform.cpu(), sample_rate=16000)
-        print(f" Audio saved to {output_path}")
         return output_path
     except Exception as e:
-        print(" Error during TTS generation:", e)
         return "Error during speech synthesis."
 # Gradio interface
@@ -61,7 +61,5 @@ demo = gr.Interface(
 )
 if __name__ == "__main__":
-    print(" Launching Gradio demo...")
     demo.launch()

 import torch
 import gradio as gr
 import torchaudio
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
 from transformers.models.speecht5 import SpeechT5HifiGan
 # Load model and processor
 processor = SpeechT5Processor.from_pretrained("nambn0321/T5_british")
 model = SpeechT5ForTextToSpeech.from_pretrained(
 model = model.to(device)
 vocoder = vocoder.to(device)
 def tts_generate(text):
     """Synthesize speech for ``text`` and return the path of the saved WAV file.

     Uses the module-level ``processor``, ``model``, ``vocoder`` and ``device``
     objects. The audio is written to ``output.wav`` at a 16 kHz sample rate.

     Args:
         text: The text to convert to speech.

     Returns:
         The path of the written WAV file, or the string
         "Error during speech synthesis." if any step raised an exception.
     """
     print(f"Input text: {text}")
     try:
         # Preprocess input
         print("Processing input...")
         inputs = processor(text=text, return_tensors="pt").to(device)
         print("Text processed.")
         # Generate the waveform in one step. generate_speech() runs the
         # autoregressive decoding loop and, because a vocoder is supplied,
         # returns audio samples rather than a mel spectrogram. Calling
         # model(...) directly is the training-style forward pass (it needs
         # decoder inputs and does not return a (mel, _) pair), and
         # SpeechT5HifiGan exposes no .decode() method, so the two-step
         # variant failed on every request.
         # NOTE(review): some transformers releases require
         # speaker_embeddings=... here; confirm against the installed version.
         print("🎤 Generating speech waveform...")
         with torch.no_grad():
             waveform = model.generate_speech(
                 inputs["input_ids"],
                 vocoder=vocoder,
             )
         print("Waveform generated.")
         # Save waveform
         output_path = "output.wav"
         # torchaudio.save expects a 2-D (channels, samples) tensor, so a 1-D
         # waveform gets a leading channel dimension.
         if waveform.dim() == 1:
             waveform = waveform.unsqueeze(0)
         torchaudio.save(output_path, waveform.cpu(), sample_rate=16000)
         print(f"Audio saved to {output_path}")
         return output_path
     except Exception as e:
         # Best-effort boundary: log the failure and hand back a message.
         # NOTE(review): if the Gradio output is an audio component, this
         # string would be interpreted as a file path; confirm against the
         # interface definition.
         print("Error during TTS generation:", e)
         return "Error during speech synthesis."
 # Gradio interface
 )
 if __name__ == "__main__":
+    print("Launching Gradio demo...")
     demo.launch()