Spaces:
Build error
Build error
| import torch | |
| import numpy as np | |
| import gradio as gr | |
| from transformers import AutoProcessor, SpeechT5ForTextToSpeech, pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, SpeechT5HifiGan | |
| from datasets import load_dataset | |
| device = "cpu" | |
| # load speech translation checkpoint | |
| asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device) | |
| # load text-to-speech checkpoint | |
| tts_processor = AutoProcessor.from_pretrained("susnato/speecht5_finetuned_voxpopuli_nl") | |
| tts_model = SpeechT5ForTextToSpeech.from_pretrained("susnato/speecht5_finetuned_voxpopuli_nl").to(device) | |
| vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device) | |
| # load speaker embeddings | |
| embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") | |
| speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) | |
| def transcribe(audio): | |
| outputs = asr_pipe(audio, generate_kwargs={"task": "transcribe", | |
| "language":"nl", | |
| "use_cache":True, | |
| "max_new_tokens":128}) | |
| return outputs["text"] | |
| def synthesise(text): | |
| inputs = tts_processor(text=text, | |
| truncation=True, | |
| return_tensors="pt") | |
| speech = tts_model.generate_speech(inputs["input_ids"].to(device), | |
| speaker_embeddings.to(device), | |
| vocoder=vocoder, | |
| ) | |
| return speech.cpu().numpy() | |
| def speech_to_dutch_translation(audio): | |
| dutch_text = transcribe(audio) | |
| speech = synthesise(dutch_text) | |
| speech = (speech * 32767).astype(np.int16) | |
| return 16_000, speech | |
| title = "Speech-To-Speech-Translation for Hindi" | |
| description = """ | |
|  | |
| """ | |
| demo = gr.Blocks() | |
| mic_translate = gr.Interface( | |
| fn=speech_to_dutch_translation, | |
| inputs=gr.Audio(source="microphone", type="filepath"), | |
| outputs=gr.Audio(label="Generated Speech", type="numpy"), | |
| title=title, | |
| description=description, | |
| ) | |
| file_translate = gr.Interface( | |
| fn=speech_to_dutch_translation, | |
| inputs=gr.Audio(source="upload", type="filepath"), | |
| outputs=gr.Audio(label="Generated Speech", type="numpy"), | |
| # examples=["./example.wav"]], | |
| title=title, | |
| description=description, | |
| ) | |
| with demo: | |
| gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"]) | |
| demo.launch(debug=False) |