This model is a fine-tuned version of microsoft/speecht5_tts, trained on a dataset derived from audiobooks recorded by Karen Savage, which are in the public domain on LibriVox.org.
Here are some sample audio outputs from this model.
There is currently no standardized metric to evaluate TTS quality effectively.
Evaluation was done mainly through listening tests to determine naturalness.
10 as "ten"). import torch
import torchaudio
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Load processor, model, and vocoder
processor = SpeechT5Processor.from_pretrained("nambn0321/T5_US_accent_2")
model = SpeechT5ForTextToSpeech.from_pretrained("nambn0321/T5_US_accent_2", use_safetensors=True, trust_remote_code=True)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# Move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
vocoder = vocoder.to(device)
# For simplicity, use this predefined speaker embedding (a sketch for computing your own follows the example below)
speaker_embedding = torch.tensor([[-7.8568e-02, -4.2079e-03, 1.1993e-02, 1.2876e-02, 3.8205e-03,
-1.9735e-03, -6.8052e-02, -6.2425e-02, 4.2591e-02, 2.0495e-02,
-6.5605e-02, -7.4267e-02, 4.7141e-02, 3.1141e-02, 3.3795e-02,
6.8717e-02, 1.5437e-02, 2.9659e-02, 9.6837e-03, 1.6690e-02,
4.1287e-02, 1.0799e-02, -1.4346e-02, -3.6507e-02, -6.9912e-02,
-1.1495e-02, -5.9190e-02, 5.0997e-03, 3.5220e-02, 2.7239e-02,
-3.0035e-03, 4.0179e-02, 2.7811e-02, -3.7754e-02, 4.2270e-02,
-7.6790e-02, 3.3923e-02, 5.8342e-02, -6.8696e-02, -6.8298e-02,
-1.5029e-03, -5.7018e-02, -4.0267e-03, 5.2543e-02, 1.2046e-02,
-1.1127e-01, -1.9529e-02, 1.1586e-02, -7.0273e-02, 5.7403e-02,
1.9700e-02, 3.5813e-02, 3.8164e-02, 4.1581e-02, -7.9466e-02,
-4.0844e-03, 4.3121e-02, 2.5432e-02, 1.6693e-02, 1.4494e-02,
3.2961e-02, -1.0050e-02, -1.6570e-02, 2.1572e-02, 2.3886e-02,
3.7505e-02, 2.3737e-03, -3.5667e-02, -6.9384e-02, -6.1990e-02,
2.1427e-02, 1.0910e-02, -4.4866e-03, 1.9126e-02, 3.5026e-02,
2.6617e-02, 1.0270e-02, 1.7574e-02, -5.0846e-02, -7.9475e-02,
-5.9455e-02, -5.5634e-02, -5.4523e-02, -6.2594e-02, -3.4710e-02,
-4.8424e-02, -6.5559e-02, 4.3848e-02, -8.9867e-06, 5.7124e-02,
2.9633e-02, -8.8773e-02, 8.2799e-03, -6.3414e-02, 2.7484e-02,
6.6257e-03, 3.2360e-02, 3.4513e-02, -2.0671e-02, -8.1817e-02,
4.1832e-02, -6.9010e-02, -5.7109e-02, 5.1551e-02, 3.6937e-02,
-5.9055e-02, 2.5737e-02, 4.8279e-02, 4.0342e-02, 2.0409e-02,
-7.8760e-02, 4.8960e-02, 6.1605e-02, 1.5055e-03, 4.4753e-02,
5.1425e-02, -6.9668e-02, -3.3952e-02, -5.3081e-02, -3.3253e-02,
2.1449e-02, -7.3866e-02, 1.5239e-02, 3.7210e-02, -7.0857e-02,
4.2094e-02, -7.8425e-02, 2.2612e-02, 4.6070e-02, 3.1248e-02,
2.1681e-02, 9.0710e-03, 2.6234e-02, 3.9768e-02, 2.6416e-02,
-5.9739e-02, -5.3194e-02, 1.1592e-02, -7.3099e-02, -4.0911e-02,
2.9276e-02, 4.0793e-03, -2.7053e-02, 4.3887e-02, -7.4993e-02,
2.8244e-02, 1.4546e-02, -5.5933e-02, 5.4590e-02, -9.8596e-02,
2.3044e-02, -4.3384e-02, -6.2760e-02, 4.9645e-02, 1.9709e-02,
2.2457e-02, 1.0992e-02, -9.1083e-02, -7.2880e-02, 5.3015e-02,
1.4966e-02, 7.6749e-03, 1.2842e-02, -6.0044e-02, 1.4364e-03,
1.2117e-02, 3.7999e-02, 4.1830e-02, 1.7146e-02, 4.1624e-02,
1.9113e-02, -8.6394e-02, 3.9947e-02, -4.5318e-02, -1.5646e-02,
1.7320e-02, -5.8261e-02, 1.3057e-02, 1.7871e-02, -7.2801e-02,
2.7487e-02, -5.1378e-02, 1.0601e-02, 3.2772e-02, -3.3645e-02,
-9.6321e-03, 5.7508e-02, 3.8802e-02, -5.4275e-02, -6.4749e-02,
-2.3990e-02, 4.4422e-02, -5.5291e-02, 2.1329e-02, 3.5870e-02,
1.5788e-02, 1.9083e-02, -2.5848e-03, 3.0792e-02, -2.4433e-02,
4.0921e-02, 2.2340e-02, -4.7077e-02, 5.6612e-03, 2.4069e-02,
1.7687e-02, 5.2614e-02, -1.4121e-02, 4.4471e-02, -4.5358e-02,
3.0660e-03, -8.4165e-02, -4.3935e-02, 5.7635e-02, -4.6062e-02,
2.8475e-02, 2.7438e-02, -7.8207e-02, 3.6834e-02, 3.5305e-02,
-7.9270e-02, 1.5048e-02, -7.7217e-02, -3.3846e-02, 4.0682e-02,
4.5813e-02, 6.3953e-02, 8.8146e-02, 3.9316e-02, 3.6404e-02,
-3.6674e-02, 3.9037e-02, 3.2509e-02, -3.3039e-02, 9.0764e-03,
-1.9967e-02, 3.4478e-02, 2.2831e-02, -6.8772e-04, 5.4448e-02,
-6.7131e-02, 2.6475e-02, -9.6572e-02, 2.7054e-02, -6.1189e-02,
4.2293e-02, 5.5649e-02, 2.4348e-02, 6.6935e-03, 4.2651e-02,
3.7361e-02, 3.3392e-02, 9.3010e-03, -5.7520e-02, 5.3737e-03,
4.5707e-02, 2.8316e-02, -1.5346e-03, -6.4626e-02, 5.0692e-02,
1.4295e-02, -5.4578e-02, 3.8668e-02, 2.1647e-02, 1.4004e-03,
2.3282e-02, 3.1919e-02, 1.2071e-02, 1.3926e-02, -4.4616e-02,
4.2064e-02, -1.8788e-02, 1.6830e-02, -1.6330e-02, -6.7638e-02,
4.5764e-02, 1.6224e-02, 1.3495e-02, -7.7807e-02, -4.8269e-02,
-2.7209e-02, 5.7491e-02, 3.6628e-02, -8.6239e-02, -5.5271e-02,
3.9839e-02, 1.0211e-03, 5.5201e-02, -9.7384e-02, 3.8847e-03,
1.0693e-02, 7.5698e-03, -5.3666e-02, 4.1555e-02, -3.2620e-02,
3.2532e-02, 7.4491e-03, 3.6136e-02, 1.7120e-02, 2.5016e-02,
6.8792e-02, 2.9997e-02, 2.1673e-02, -7.8844e-02, 1.1353e-02,
3.5831e-02, 3.0084e-02, 3.0417e-02, 2.9927e-02, 2.1848e-02,
4.9556e-02, 2.2132e-02, -2.8324e-02, 4.4158e-02, -8.2102e-02,
-6.4570e-02, -2.4734e-02, 3.2701e-02, -7.0163e-02, 5.4873e-02,
-4.7028e-02, 4.4843e-02, -4.5314e-02, 1.0327e-02, 2.8297e-02,
-5.7504e-02, 4.7179e-02, 7.4731e-02, -6.5681e-02, -8.6343e-02,
-6.4412e-02, 3.1260e-02, 1.6076e-02, 4.7171e-02, -7.1781e-02,
4.2377e-02, 3.9755e-02, -3.6226e-02, -7.4231e-03, -6.4577e-02,
3.0569e-02, -5.3078e-02, 2.7852e-02, -7.6148e-03, -7.3751e-02,
2.0000e-02, 2.1321e-02, 1.5519e-02, -3.6516e-02, -5.5269e-02,
-4.3193e-02, -1.7178e-02, -5.1271e-02, 1.0353e-01, 4.1393e-02,
-4.7789e-02, -8.0428e-03, 2.9483e-02, -5.4314e-02, 1.0356e-02,
-1.0647e-01, 2.6810e-02, -1.3466e-02, -9.5602e-04, 5.6365e-02,
-3.4805e-02, -4.8433e-02, 5.5901e-03, 1.0095e-02, 4.4062e-02,
1.3886e-02, 2.7514e-02, -9.5484e-02, 1.4190e-02, -1.3233e-02,
-2.4893e-03, 2.6416e-02, 6.7407e-03, 6.1025e-02, 3.8437e-02,
-7.4136e-02, -1.1276e-01, 1.3998e-02, 4.5844e-02, 1.8342e-02,
-6.7303e-02, 2.9729e-02, -6.0356e-02, 3.4768e-02, 2.6196e-02,
5.8514e-03, 7.3593e-03, -4.2139e-02, 3.0210e-02, 1.5900e-02,
7.0803e-03, 3.3725e-02, -8.8192e-02, 1.3683e-03, 1.4380e-02,
-1.8023e-02, -6.0320e-02, 1.4030e-02, -4.0541e-02, 4.6965e-03,
7.1572e-03, 1.0316e-02, -7.6909e-02, -5.5507e-02, -6.4332e-02,
-6.2764e-02, 2.3172e-02, 1.5215e-02, -1.5576e-02, 2.3396e-02,
-5.4251e-02, 1.7465e-02, -9.1552e-02, -1.4350e-01, -1.5228e-02,
-5.0016e-02, 1.5546e-02, 1.9486e-02, -2.2702e-02, -6.0833e-02,
1.8424e-02, 4.1719e-02, 3.1578e-02, 2.6568e-02, -4.9155e-02,
-5.2004e-02, -1.8590e-02, -2.7371e-02, 3.8227e-02, 3.2638e-02,
7.9873e-03, 4.5671e-02, 2.4781e-02, -6.7724e-02, -7.6685e-02,
1.3213e-02, 1.9150e-02, 2.0911e-02, 4.8548e-03, 5.5948e-02,
2.9883e-02, 2.2585e-02, 1.0647e-02, 9.4530e-03, -1.6939e-02,
4.8591e-02, 2.6256e-02, 4.8367e-02, 5.7640e-02, 1.4820e-02,
1.0206e-02, 2.1576e-02, -6.3301e-02, -6.1438e-02, 4.9681e-02,
-1.4290e-02, 9.2644e-03, 4.7036e-02, 2.7807e-02, -4.7537e-02,
2.8718e-02, 3.9035e-02, -6.9315e-02, 2.0267e-02, 9.3887e-03,
-2.3518e-03, 3.0030e-02, 2.0438e-02, 4.7360e-03, -1.5699e-02,
-7.5235e-02, 1.8405e-02, -5.7478e-03, 2.8843e-02, 4.1911e-02,
-6.1657e-02, -5.3779e-02, 1.2746e-02, 2.4689e-02, 2.3149e-02,
3.2983e-02, -5.4079e-02, 2.3033e-02, -1.2222e-02, -1.3194e-02,
-4.7920e-02, 3.9478e-02, -5.1594e-02, 1.0203e-02, 8.6237e-04,
-1.2024e-03, -5.9529e-02, 1.3870e-02, -6.7391e-02, -7.4410e-02,
9.1564e-03, 2.5374e-02, -8.6928e-02, 3.2397e-02, -4.7997e-02,
-1.4516e-02, -6.2727e-02, 4.8488e-02, 6.5368e-02, -2.2742e-02,
3.6199e-02, -7.3590e-02]]).to(device)
# Input text
text = "Hello, how are you doing?"
# Process text
inputs = processor(text=text, return_tensors="pt").to(device)
# Generate speech
with torch.no_grad():
    speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
# Save to file
if speech.dim() == 1:
    speech = speech.unsqueeze(0)
torchaudio.save("output.wav", speech.cpu(), sample_rate=16000)
Base model: microsoft/speecht5_tts