speech-to-dutch-translation

Build error

App Files Files Community

susnato committed on Jul 21, 2023

Commit

cb99941

1 Parent(s): e6e41df

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -35

app.py CHANGED Viewed

@@ -1,51 +1,47 @@
 import torch
 import numpy as np
 import gradio as gr
-from transformers import AutoProcessor, AutoModel, pipeline, MarianMTModel, MarianTokenizer
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
 # load speech translation checkpoint
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
-# load text-to-speech checkpoint and speaker embeddings
-processor = AutoProcessor.from_pretrained("suno/bark-small")
-model = AutoModel.from_pretrained("suno/bark-small").to(device)
-# load MarianMT model for translating English to Hindi.
-martian_mt_model = MarianMTModel.from_pretrained("AbhirupGhosh/opus-mt-finetuned-en-hi")
-martian_mt_tokenizer = MarianTokenizer.from_pretrained("AbhirupGhosh/opus-mt-finetuned-en-hi")
-def translate_english_to_hindi(english_text):
-    tokenized_text = martian_mt_tokenizer.encode(english_text, return_tensors="pt")
-    generated_token_ids = martian_mt_model.generate(tokenized_text, use_cache=True, max_new_tokens=128)
-    hindi_text = martian_mt_tokenizer.decode(generated_token_ids.numpy()[0])
-    hindi_text = hindi_text.replace("</s>", "")
-    hindi_text = hindi_text.replace("<pad>", "")
-    return hindi_text
-def translate_to_english(audio):
-    outputs = asr_pipe(audio, generate_kwargs={"task": "transcribe", "use_cache":"True", "max_new_tokens":128})
     return outputs["text"]
 def synthesise(text):
-    inputs = processor(text=text, return_tensors="pt").to(device)
-    speech_values = model.generate(**inputs, use_cache=True, max_new_tokens=128)
-    speech_values = speech_values.cpu().numpy()
-    return speech_values
-def speech_to_hindi_translation(audio):
-    english_text = translate_to_english(audio)
-    hindi_text = translate_english_to_hindi(english_text)
-    synthesised_speech = synthesise(hindi_text)[0]
-    synthesised_speech = (synthesised_speech * 32767).astype(np.int16)
-    return 22050, synthesised_speech
 title = "Speech-To-Speech-Translation for Hindi"
@@ -56,7 +52,7 @@ description = """
 demo = gr.Blocks()
 mic_translate = gr.Interface(
-    fn=speech_to_hindi_translation,
     inputs=gr.Audio(source="microphone", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
     title=title,
@@ -64,7 +60,7 @@ mic_translate = gr.Interface(
 )
 file_translate = gr.Interface(
-    fn=speech_to_hindi_translation,
     inputs=gr.Audio(source="upload", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
     # examples=["./example.wav"]],
@@ -75,4 +71,4 @@ file_translate = gr.Interface(
 with demo:
     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
-demo.launch(debug=True)

 import torch
 import numpy as np
 import gradio as gr
+from transformers import AutoProcessor, SpeechT5ForTextToSpeech, pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, SpeechT5HifiGan
+from datasets import load_dataset
+device = "cpu"
 # load speech translation checkpoint
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
+# load text-to-speech checkpoint
+tts_processor = AutoProcessor.from_pretrained("susnato/speecht5_finetuned_voxpopuli_nl")
+tts_model = SpeechT5ForTextToSpeech.from_pretrained("susnato/speecht5_finetuned_voxpopuli_nl").to(device)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+# load speaker embeddings
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+def transcribe(audio):
+    outputs = asr_pipe(audio, generate_kwargs={"task": "transcribe",
+                                               "language":"nl",
+                                               "use_cache":True,
+                                               "max_new_tokens":128})
     return outputs["text"]
 def synthesise(text):
+    inputs = tts_processor(text=text,
+                           truncation=True,
+                           return_tensors="pt")
+    speech = tts_model.generate_speech(inputs["input_ids"].to(device),
+                                       speaker_embeddings.to(device),
+                                       vocoder=vocoder,
+                                       )
+    return speech.cpu().numpy()
+def speech_to_dutch_translation(audio):
+    dutch_text = transcribe(audio)
+    speech = synthesise(dutch_text)
+    speech = (speech * 32767).astype(np.int16)
+    return 16_000, speech
 title = "Speech-To-Speech-Translation for Hindi"
 demo = gr.Blocks()
 mic_translate = gr.Interface(
+    fn=speech_to_dutch_translation,
     inputs=gr.Audio(source="microphone", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
     title=title,
 )
 file_translate = gr.Interface(
+    fn=speech_to_dutch_translation,
     inputs=gr.Audio(source="upload", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
     # examples=["./example.wav"]],
 with demo:
     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
+demo.launch(debug=False)