Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,134 +5,134 @@ import torchaudio
|
|
| 5 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
|
| 6 |
from transformers.models.speecht5 import SpeechT5HifiGan
|
| 7 |
|
| 8 |
-
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
|
| 9 |
-
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
|
| 10 |
-
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
| 11 |
-
|
| 12 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 13 |
-
model = model.to(device)
|
| 14 |
-
vocoder = vocoder.to(device)
|
| 15 |
-
|
| 16 |
-
speaker_embedding = torch.zeros(1, 512).to(device)
|
| 17 |
-
|
| 18 |
-
# Load model and processor
|
| 19 |
-
# processor = SpeechT5Processor.from_pretrained("nambn0321/TTS_british_2nd_attempt")
|
| 20 |
-
# model = SpeechT5ForTextToSpeech.from_pretrained(
|
| 21 |
-
# "nambn0321/TTS_british_2nd_attempt",
|
| 22 |
-
# use_safetensors=True,
|
| 23 |
-
# trust_remote_code=True
|
| 24 |
-
# )
|
| 25 |
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
| 26 |
|
| 27 |
-
# # Move to CUDA if available
|
| 28 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 29 |
# model = model.to(device)
|
| 30 |
# vocoder = vocoder.to(device)
|
| 31 |
|
| 32 |
-
#
|
| 33 |
-
|
| 34 |
-
#
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
#
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
#
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
def tts_generate(text):
|
| 138 |
print(f"📝 Input text: {text}")
|
|
|
|
| 5 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
|
| 6 |
from transformers.models.speecht5 import SpeechT5HifiGan
|
| 7 |
|
| 8 |
+
# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
|
| 9 |
+
# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
| 11 |
|
|
|
|
| 12 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 13 |
# model = model.to(device)
|
| 14 |
# vocoder = vocoder.to(device)
|
| 15 |
|
| 16 |
+
# speaker_embedding = torch.zeros(1, 512).to(device)
|
| 17 |
+
|
| 18 |
+
# Load the processor, the fine-tuned TTS model, and the HiFi-GAN vocoder
|
| 19 |
+
processor = SpeechT5Processor.from_pretrained("nambn0321/TTS_british_2nd_attempt")
|
| 20 |
+
model = SpeechT5ForTextToSpeech.from_pretrained(
|
| 21 |
+
"nambn0321/TTS_british_2nd_attempt",
|
| 22 |
+
use_safetensors=True,
|
| 23 |
+
trust_remote_code=True
|
| 24 |
+
)
|
| 25 |
+
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
| 26 |
+
|
| 27 |
+
# Move to CUDA if available
|
| 28 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 29 |
+
model = model.to(device)
|
| 30 |
+
vocoder = vocoder.to(device)
|
| 31 |
+
|
| 32 |
+
# Hard-coded speaker embedding literal (or load your real one here)
|
| 33 |
+
speaker_embedding = torch.tensor([[-8.4146e-02, -9.6771e-03, 2.6844e-02, 2.2544e-02, -1.8455e-03,
|
| 34 |
+
-2.5492e-02, -6.2846e-02, -6.6044e-02, 3.1312e-02, 1.9042e-02,
|
| 35 |
+
-7.3039e-02, -8.0007e-02, 4.7080e-02, 3.7199e-02, 3.8119e-02,
|
| 36 |
+
5.2214e-02, 1.1470e-02, 3.5968e-02, 8.1480e-03, 1.9952e-02,
|
| 37 |
+
3.5186e-02, 1.3260e-02, -1.4271e-02, -3.8798e-02, -6.8658e-02,
|
| 38 |
+
-1.0121e-02, -5.8041e-02, 7.0087e-03, 2.4180e-02, 2.5093e-02,
|
| 39 |
+
-2.9513e-03, 4.1835e-02, 3.4997e-02, -4.7138e-02, 5.1652e-02,
|
| 40 |
+
-6.1358e-02, 8.9115e-03, 6.5912e-02, -6.1133e-02, -6.3178e-02,
|
| 41 |
+
2.6463e-02, -6.4097e-02, 2.5865e-02, 3.6212e-02, 1.1775e-02,
|
| 42 |
+
-1.1237e-01, -1.9422e-02, 1.1771e-02, -5.1472e-02, 6.1495e-02,
|
| 43 |
+
1.8622e-02, 3.6634e-02, 4.0088e-02, 2.4596e-02, -8.8999e-02,
|
| 44 |
+
-7.0578e-03, 4.2418e-02, 3.0962e-02, 3.6715e-02, 9.8362e-03,
|
| 45 |
+
2.2486e-02, -8.9123e-03, -1.5486e-02, 9.6940e-03, 2.6662e-02,
|
| 46 |
+
3.0941e-02, 9.7256e-03, -3.7924e-02, -6.1841e-02, -5.8947e-02,
|
| 47 |
+
1.9625e-02, 1.0872e-02, 2.3126e-02, 9.8068e-03, 4.3716e-02,
|
| 48 |
+
2.3031e-02, 1.6826e-02, 1.5088e-02, -7.1926e-02, -9.1217e-02,
|
| 49 |
+
-7.4439e-02, -3.6920e-02, -6.2430e-02, -5.1998e-02, -3.4543e-02,
|
| 50 |
+
-7.5438e-02, -4.9039e-02, 6.2118e-02, 1.7150e-02, 5.1786e-02,
|
| 51 |
+
2.8916e-02, -7.0776e-02, 4.1283e-03, -7.0833e-02, 7.7827e-02,
|
| 52 |
+
-1.3024e-03, 2.8799e-02, 2.4766e-02, -5.6856e-02, -4.9455e-02,
|
| 53 |
+
4.2941e-02, -6.8559e-02, -8.7198e-02, 5.3497e-02, 3.1263e-02,
|
| 54 |
+
-4.4196e-02, 4.5291e-02, 3.5016e-02, 2.7069e-02, 1.5925e-02,
|
| 55 |
+
-6.9111e-02, 2.8218e-02, 6.5694e-02, -2.9196e-03, 5.4257e-02,
|
| 56 |
+
3.1966e-02, -5.5521e-02, -2.9675e-02, -5.7311e-02, -2.5660e-02,
|
| 57 |
+
2.5544e-02, -6.7957e-02, 3.2566e-02, 5.1615e-02, -6.6451e-02,
|
| 58 |
+
4.0387e-02, -5.5676e-02, 6.8813e-03, 4.2905e-02, 5.4461e-02,
|
| 59 |
+
4.0467e-02, 9.1374e-03, 1.8873e-02, 4.0569e-02, 2.6208e-02,
|
| 60 |
+
-5.7306e-02, -4.2936e-02, 3.0970e-02, -5.9009e-02, -2.8001e-03,
|
| 61 |
+
4.3110e-03, 3.9585e-03, -1.6110e-02, 3.8766e-02, -5.6695e-02,
|
| 62 |
+
3.8432e-02, -8.8127e-03, -4.8471e-02, 6.2207e-02, -1.0577e-01,
|
| 63 |
+
5.1350e-02, -3.8907e-02, -5.3652e-02, 3.5648e-02, 1.5506e-02,
|
| 64 |
+
1.6576e-02, 5.0061e-03, -6.9767e-02, -4.8234e-02, 5.2357e-02,
|
| 65 |
+
2.8605e-02, 3.0886e-03, 1.0158e-02, -5.5175e-02, -2.0156e-02,
|
| 66 |
+
8.5321e-03, 1.9991e-02, 3.8980e-02, 7.9981e-03, 4.3244e-02,
|
| 67 |
+
1.0584e-02, -8.0995e-02, 3.8089e-02, -3.5556e-02, -2.6731e-03,
|
| 68 |
+
7.6711e-03, -4.6591e-02, 1.0168e-02, 2.5794e-02, -6.5743e-02,
|
| 69 |
+
4.1506e-02, -5.1859e-02, 2.2370e-02, 2.5183e-02, -5.7133e-02,
|
| 70 |
+
-9.8171e-03, 4.1331e-02, 4.1121e-02, -3.7192e-02, -6.7827e-02,
|
| 71 |
+
-6.6048e-03, -5.5967e-03, -6.1243e-02, 1.8434e-02, 1.7639e-02,
|
| 72 |
+
3.2567e-02, 4.3591e-02, 1.1444e-04, 2.1000e-02, -3.3589e-02,
|
| 73 |
+
6.3952e-02, 3.2268e-02, -4.5664e-02, 6.0440e-03, 5.8870e-03,
|
| 74 |
+
1.3482e-02, 3.1084e-02, -4.7126e-02, 4.0494e-02, -3.1627e-02,
|
| 75 |
+
4.4450e-03, -8.1786e-02, -3.9848e-02, 5.6240e-02, -4.5132e-02,
|
| 76 |
+
4.4740e-02, 5.3827e-02, -9.0746e-02, 2.2419e-02, 1.4306e-02,
|
| 77 |
+
-9.1269e-02, 2.7554e-02, -6.4524e-02, -2.8340e-02, 4.7677e-02,
|
| 78 |
+
4.6020e-02, 6.4275e-02, 8.6387e-02, 6.4071e-02, 2.6549e-02,
|
| 79 |
+
-4.1220e-02, 2.2935e-02, 4.3711e-02, -4.0866e-02, -8.3878e-04,
|
| 80 |
+
-2.0668e-02, 4.3676e-02, 1.9026e-02, -5.8537e-04, 3.7715e-02,
|
| 81 |
+
-6.8229e-02, 2.5884e-02, -8.7095e-02, 9.5215e-03, -3.3552e-02,
|
| 82 |
+
4.5210e-02, 4.0977e-02, 7.1891e-03, 4.1980e-02, 3.2498e-02,
|
| 83 |
+
4.2608e-02, 3.9835e-02, 9.1516e-03, -4.8366e-02, 1.4572e-02,
|
| 84 |
+
2.0647e-02, 3.6124e-02, 4.6580e-02, -7.7210e-02, 5.7704e-02,
|
| 85 |
+
1.1030e-03, -5.8918e-02, 2.9356e-02, 3.4516e-02, 1.2991e-03,
|
| 86 |
+
-1.1909e-02, 4.8185e-02, 3.0447e-02, 4.7657e-03, -4.7582e-02,
|
| 87 |
+
3.2026e-02, -1.8162e-02, 1.0278e-02, -3.2285e-02, -6.1007e-02,
|
| 88 |
+
2.3500e-02, 3.6015e-02, 1.5098e-02, -6.6183e-02, -4.5165e-02,
|
| 89 |
+
-2.3547e-03, 2.5537e-02, 3.6404e-02, -7.5865e-02, -6.8920e-02,
|
| 90 |
+
4.4063e-02, 2.2352e-02, 5.3221e-02, -1.1117e-01, 1.6892e-03,
|
| 91 |
+
1.5308e-02, 2.4001e-02, -3.9773e-02, 3.4531e-02, -4.2249e-02,
|
| 92 |
+
3.2091e-02, 1.8124e-02, -8.6720e-04, 1.1351e-02, 1.1819e-02,
|
| 93 |
+
6.4638e-02, 2.8144e-02, 2.3477e-02, -6.5364e-02, 9.0942e-03,
|
| 94 |
+
4.2016e-02, 1.3313e-02, 3.1852e-02, 4.7584e-02, 5.6397e-02,
|
| 95 |
+
2.4306e-02, 2.4805e-02, -3.0570e-02, 5.5471e-02, -9.8115e-02,
|
| 96 |
+
-7.9501e-02, -2.4848e-02, 2.5320e-02, -6.8306e-02, 5.7311e-02,
|
| 97 |
+
-4.5671e-02, 5.5003e-02, -4.7802e-02, 1.1989e-02, 1.5714e-02,
|
| 98 |
+
-5.9569e-02, 2.2274e-02, 6.2575e-02, -6.7901e-02, -1.0904e-01,
|
| 99 |
+
-5.9256e-02, 3.2378e-02, 2.7536e-03, 4.5289e-02, -7.6812e-02,
|
| 100 |
+
5.6575e-02, 4.3113e-02, -4.5169e-02, -1.9948e-02, -6.8110e-02,
|
| 101 |
+
3.7796e-02, -5.1603e-02, 1.7293e-02, 3.9350e-04, -9.2822e-02,
|
| 102 |
+
2.9600e-02, 1.8698e-02, 2.5137e-02, -4.4556e-02, -5.5333e-02,
|
| 103 |
+
-3.0218e-02, -3.8053e-02, -4.6037e-02, 9.3173e-02, 3.0859e-02,
|
| 104 |
+
-4.5461e-02, -6.4138e-03, 3.9415e-02, -6.0368e-02, 2.1082e-02,
|
| 105 |
+
-1.0852e-01, 3.1828e-02, -6.7432e-03, 2.7266e-02, 4.0655e-02,
|
| 106 |
+
-1.2353e-02, -5.5131e-02, 8.4986e-03, 7.7413e-03, 3.8210e-02,
|
| 107 |
+
1.1482e-02, 4.4694e-02, -7.8239e-02, 2.7954e-02, -1.4465e-02,
|
| 108 |
+
-2.1566e-03, 4.4880e-02, 1.1331e-02, 4.6676e-02, 4.9100e-02,
|
| 109 |
+
-8.2094e-02, -1.0947e-01, 2.4079e-02, 3.3945e-02, 1.6590e-02,
|
| 110 |
+
-7.0771e-02, 4.7745e-02, -6.1351e-02, 3.1324e-02, 4.3053e-02,
|
| 111 |
+
3.6573e-02, 1.3540e-02, -4.3418e-02, 2.7820e-02, 1.8301e-02,
|
| 112 |
+
2.2243e-02, 2.3911e-02, -1.0137e-01, -1.4640e-02, 6.4367e-03,
|
| 113 |
+
2.2727e-03, -5.7253e-02, 9.6726e-03, -2.1960e-02, 7.6282e-03,
|
| 114 |
+
6.4420e-03, 5.0940e-03, -6.7982e-02, -3.8508e-02, -6.1518e-02,
|
| 115 |
+
-8.3228e-02, 2.2245e-02, 4.1082e-02, -1.2621e-02, 1.4017e-02,
|
| 116 |
+
-3.7110e-02, 2.4853e-03, -7.3434e-02, -1.5334e-01, -4.2328e-02,
|
| 117 |
+
-3.7094e-02, 5.9061e-03, 2.3480e-02, -4.8760e-02, -5.4049e-02,
|
| 118 |
+
2.4981e-02, 3.9116e-02, 1.9743e-02, 3.5136e-02, -3.8281e-02,
|
| 119 |
+
-4.0026e-02, -1.6648e-02, -2.5015e-02, 4.2738e-02, 3.4282e-02,
|
| 120 |
+
2.5185e-03, 3.5040e-02, 3.0350e-02, -6.4696e-02, -5.7804e-02,
|
| 121 |
+
3.7872e-02, 4.9861e-02, 2.4467e-02, 2.2676e-03, 3.6383e-02,
|
| 122 |
+
4.5123e-02, 2.6333e-02, 8.5096e-03, -6.5880e-03, -1.3365e-02,
|
| 123 |
+
3.4063e-02, 6.1048e-03, 8.9429e-03, 1.9870e-02, -7.0909e-03,
|
| 124 |
+
4.1589e-03, 2.4686e-02, -6.6816e-02, -7.1205e-02, 4.5023e-02,
|
| 125 |
+
-1.2020e-02, 1.7312e-02, 7.1944e-02, 2.5880e-02, -5.8930e-02,
|
| 126 |
+
2.1297e-02, 3.8620e-02, -8.1744e-02, 1.2381e-02, 2.5355e-02,
|
| 127 |
+
-1.8926e-03, 1.7969e-02, 1.8478e-02, 4.2966e-03, -3.4873e-02,
|
| 128 |
+
-6.8647e-02, 6.3062e-03, -5.6329e-03, 5.0244e-02, 7.9961e-02,
|
| 129 |
+
-4.6829e-02, -5.0599e-02, 2.6159e-02, 3.4554e-02, 4.3835e-02,
|
| 130 |
+
3.8469e-02, -4.2016e-02, 4.2099e-02, -1.9708e-02, -1.1056e-02,
|
| 131 |
+
-4.5883e-02, 1.9459e-02, -4.2683e-02, 2.8083e-02, 2.1005e-02,
|
| 132 |
+
-1.3023e-03, -4.5802e-02, -3.6830e-02, -7.2250e-02, -5.6689e-02,
|
| 133 |
+
8.9729e-03, 2.5722e-02, -6.7266e-02, 3.6313e-02, -4.3734e-02,
|
| 134 |
+
-1.4877e-02, -6.9325e-02, 4.3192e-02, 5.8052e-02, -1.5820e-02,
|
| 135 |
+
4.2151e-02, -6.9208e-02]]).to(device)
|
| 136 |
|
| 137 |
def tts_generate(text):
|
| 138 |
print(f"📝 Input text: {text}")
|