import torch
import gradio as gr
import torchaudio
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

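# Load the fine-tuned SpeechT5 checkpoint, its text processor, and the
# pretrained HiFi-GAN vocoder used to turn spectrograms into waveforms.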
processor = SpeechT5Processor.from_pretrained("nambn0321/T5_US_Accent_1")
model = SpeechT5ForTextToSpeech.from_pretrained("nambn0321/T5_US_Accent_1", use_safetensors=True, trust_remote_code=True)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

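# Prefer GPU when available; inference also works on CPU, just more slowly.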
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
vocoder = vocoder.to(device)

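# Fixed 512-dimensional speaker embedding hard-coded into the script
# (likely an x-vector-style embedding), so no external speaker file is
# needed at inference time.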
speaker_embedding = torch.tensor([[-0.0743, -0.0103,  0.0260,  0.0237,  0.0045, -0.0173, -0.0721, -0.0579,
          0.0374,  0.0206, -0.0648, -0.0665,  0.0259,  0.0414,  0.0323,  0.0512,
         -0.0078,  0.0259,  0.0123,  0.0155,  0.0371,  0.0255, -0.0156, -0.0398,
         -0.0612, -0.0098, -0.0582, -0.0046,  0.0377,  0.0320, -0.0028,  0.0450,
          0.0136, -0.0471,  0.0584, -0.0672,  0.0124,  0.0591, -0.0767, -0.0775,
          0.0142, -0.0590,  0.0407,  0.0436,  0.0238, -0.1164, -0.0200,  0.0116,
         -0.0551,  0.0721,  0.0228,  0.0490,  0.0465,  0.0149, -0.0871, -0.0100,
          0.0324,  0.0294,  0.0441,  0.0122,  0.0189, -0.0091, -0.0154,  0.0116,
          0.0376,  0.0224,  0.0141, -0.0388, -0.0615, -0.0467,  0.0216,  0.0115,
          0.0205,  0.0136,  0.0419,  0.0258,  0.0181,  0.0173, -0.0678, -0.0821,
         -0.0862, -0.0480, -0.0566, -0.0387, -0.0345, -0.0636, -0.0453,  0.0519,
          0.0190,  0.0681,  0.0282, -0.0694, -0.0032, -0.0608,  0.0649, -0.0070,
          0.0200,  0.0304, -0.0486, -0.0640,  0.0396, -0.1017, -0.0794,  0.0478,
          0.0425, -0.0547,  0.0486,  0.0480,  0.0169,  0.0227, -0.0807,  0.0313,
          0.0611, -0.0058,  0.0498,  0.0242, -0.0534, -0.0267, -0.0341, -0.0348,
          0.0220, -0.0662,  0.0370,  0.0365, -0.0660,  0.0279, -0.0644,  0.0143,
          0.0326,  0.0500,  0.0300,  0.0072,  0.0336,  0.0345,  0.0276, -0.0646,
         -0.0484, -0.0059, -0.0605,  0.0012,  0.0081,  0.0036, -0.0033,  0.0463,
         -0.0506,  0.0270, -0.0066, -0.0609,  0.0493, -0.1155,  0.0447, -0.0371,
         -0.0567,  0.0285,  0.0146,  0.0203,  0.0108, -0.0639, -0.0762,  0.0279,
          0.0205,  0.0018,  0.0158, -0.0595, -0.0299,  0.0084,  0.0270,  0.0379,
          0.0132,  0.0510,  0.0261, -0.0636,  0.0276, -0.0498,  0.0167,  0.0027,
         -0.0372,  0.0067,  0.0527, -0.0707,  0.0391, -0.0644,  0.0172,  0.0347,
         -0.0643, -0.0093,  0.0371,  0.0346, -0.0542, -0.0589, -0.0141,  0.0344,
         -0.0659,  0.0478,  0.0131,  0.0165,  0.0172,  0.0042,  0.0322, -0.0516,
          0.0523,  0.0285, -0.0554,  0.0056, -0.0021,  0.0150,  0.0391, -0.0400,
          0.0248, -0.0332,  0.0047, -0.0792, -0.0429,  0.0398, -0.0565,  0.0409,
          0.0457, -0.0870,  0.0314,  0.0226, -0.0816,  0.0377, -0.0779, -0.0134,
          0.0412,  0.0425,  0.0585,  0.0799,  0.0527,  0.0279, -0.0557,  0.0240,
          0.0306, -0.0370,  0.0098, -0.0225,  0.0299,  0.0527, -0.0011,  0.0456,
         -0.0768,  0.0237, -0.0966,  0.0106, -0.0521,  0.0512,  0.0424,  0.0236,
          0.0301,  0.0044,  0.0502,  0.0307,  0.0095, -0.0570,  0.0166,  0.0166,
          0.0321,  0.0367, -0.0677,  0.0514,  0.0165, -0.0601,  0.0407,  0.0401,
          0.0020,  0.0015,  0.0574,  0.0310, -0.0053, -0.0610,  0.0391, -0.0212,
          0.0271, -0.0256, -0.0613,  0.0301,  0.0564,  0.0209, -0.0815, -0.0544,
         -0.0091,  0.0303,  0.0256, -0.0597, -0.0593,  0.0376,  0.0184,  0.0580,
         -0.1039,  0.0021,  0.0159,  0.0319, -0.0386,  0.0322, -0.0432,  0.0292,
          0.0096,  0.0047,  0.0127,  0.0264,  0.0627,  0.0366,  0.0212, -0.0772,
          0.0303,  0.0400,  0.0267,  0.0290,  0.0309,  0.0488,  0.0430,  0.0153,
         -0.0187,  0.0440, -0.0995, -0.0837, -0.0254,  0.0274, -0.0638,  0.0500,
         -0.0568,  0.0611, -0.0643,  0.0084,  0.0148, -0.0675,  0.0311,  0.0652,
         -0.0648, -0.0791, -0.0660,  0.0231,  0.0096,  0.0477, -0.0702,  0.0503,
          0.0446, -0.0523, -0.0305, -0.0593,  0.0238, -0.0557,  0.0130,  0.0067,
         -0.0756,  0.0354,  0.0289,  0.0261, -0.0466, -0.0584, -0.0441, -0.0355,
         -0.0699,  0.1035,  0.0268, -0.0459, -0.0062,  0.0283, -0.0462,  0.0247,
         -0.1061,  0.0222, -0.0052,  0.0058,  0.0479, -0.0126, -0.0533,  0.0160,
          0.0042,  0.0476,  0.0133,  0.0263, -0.0822,  0.0167, -0.0129, -0.0026,
          0.0359,  0.0130,  0.0528,  0.0397, -0.0638, -0.1078,  0.0214,  0.0292,
          0.0351, -0.0545,  0.0406, -0.0787,  0.0306,  0.0389,  0.0332,  0.0178,
         -0.0405,  0.0238,  0.0087,  0.0140,  0.0397, -0.0856, -0.0334, -0.0002,
         -0.0025, -0.0352,  0.0299, -0.0384,  0.0179,  0.0057,  0.0005, -0.0593,
         -0.0505, -0.0592, -0.0831,  0.0174,  0.0417, -0.0128,  0.0286, -0.0422,
         -0.0141, -0.0779, -0.1574, -0.0493, -0.0533, -0.0075,  0.0274, -0.0474,
         -0.0516,  0.0257,  0.0360,  0.0330,  0.0212, -0.0346, -0.0637, -0.0165,
         -0.0254,  0.0295,  0.0180,  0.0093,  0.0260,  0.0096, -0.0626, -0.0537,
          0.0172,  0.0479,  0.0311,  0.0023,  0.0482,  0.0456,  0.0232,  0.0089,
         -0.0030, -0.0109,  0.0400,  0.0059,  0.0046,  0.0122,  0.0007, -0.0109,
          0.0188, -0.0746, -0.0615,  0.0463, -0.0136,  0.0101,  0.0435,  0.0257,
         -0.0516,  0.0282,  0.0218, -0.0788,  0.0135,  0.0192, -0.0027,  0.0225,
          0.0103,  0.0045, -0.0529, -0.0672,  0.0158, -0.0058,  0.0440,  0.0572,
         -0.0373, -0.0386,  0.0256,  0.0211,  0.0453,  0.0515, -0.0624,  0.0371,
         -0.0205, -0.0121, -0.0542,  0.0136, -0.0411,  0.0284,  0.0219, -0.0009,
         -0.0469, -0.0276, -0.0797, -0.0664,  0.0094,  0.0443, -0.0661,  0.0388,
         -0.0244, -0.0143, -0.0674,  0.0379,  0.0583, -0.0234,  0.0413, -0.0651]]).to(device)

def tts_generate(text):
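    """Synthesize `text` into speech and return the path of a 16 kHz WAV file."""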
    try:
        # Preprocess input
        inputs = processor(text=text, return_tensors="pt").to(device)

        # Generate waveform directly with vocoder
        with torch.no_grad():
            waveform = model.generate_speech(
                inputs["input_ids"],
                speaker_embedding,
                vocoder=vocoder
            )

        # Save waveform
        output_path = "output.wav"
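        # generate_speech returns a 1-D waveform, while torchaudio.save
        # expects a 2-D (channels, samples) tensor, so add a channel dim.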
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)
        torchaudio.save(output_path, waveform.cpu(), sample_rate=16000)

        return output_path

    except Exception as e:
        print("Error during TTS generation:", e)
        # Return None so the Audio output stays empty instead of pointing
        # at a non-existent file path.
        return None

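# Minimal Gradio UI: one textbox in, one audio player out.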
demo = gr.Interface(
    fn=tts_generate,
    inputs=gr.Textbox(label="Enter text"),
    outputs=gr.Audio(label="Generated Speech", type="filepath"),
    title="SpeechT5 Text-to-Speech",
    description="Enter text and hear it spoken by my custom fine-tuned SpeechT5 model."
)

if __name__ == "__main__":
    print("Launching Gradio demo")
    demo.launch()