# jarvis/scratch/tts.py

import torchaudio
from speechbrain.pretrained import Tacotron2
from speechbrain.pretrained import HIFIGAN

import sounddevice as sd

# Scratch demo: synthesize one sentence with SpeechBrain (Tacotron2 + HiFi-GAN)
# and play it through the default audio output.

# Sample rate handed to the playback call; this script has always used 22050
# for these LJSpeech checkpoints.
SAMPLE_RATE = 22050

# Initialize TTS (Tacotron2) and vocoder (HiFi-GAN).
# Checkpoints are fetched from `source` and cached under `savedir`.
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="tmpdir_tts",
)
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="tmpdir_vocoder",
)

# Run the TTS front end (text -> mel spectrogram).
mel_output, mel_length, alignment = tacotron2.encode_text(
    "This is an open-source toolkit for the development of speech technologies."
)

# Run the vocoder (spectrogram -> waveform).
waveforms = hifi_gan.decode_batch(mel_output)
print(waveforms)

# `decode_batch` returns a batched tensor (SpeechBrain's examples treat it as
# [batch, 1, time]), whereas `torchaudio.io.play_audio` only accepts a 2D
# float32 tensor laid out as (time, channel). Take the single utterance in the
# batch and transpose it; passing the raw 3D tensor raises ValueError.
playback_waveform = waveforms[0].detach().cpu().float().t().contiguous()

# NOTE: per the torchaudio docs, play_audio is currently macOS-only and blocks
# until playback finishes. On other platforms, sounddevice (imported above) is
# a possible alternative, e.g. sd.play(playback_waveform.numpy(), SAMPLE_RATE).
torchaudio.io.play_audio(waveform=playback_waveform, sample_rate=SAMPLE_RATE)