% Conference paper published in the IEEE SLT 2021 workshop proceedings.
% The acronym TTS in the title is brace-protected so that sentence-casing
% bibliography styles do not lowercase it. Author names use the unambiguous
% "Last, First" form. The nonstandard "day" field is ignored by standard
% styles and is kept only as metadata.
@inproceedings{d9971aaeceee4732b7781b838a6bd612,
  author    = {Guo, Haohan and Zhang, Shaofei and Soong, Frank K. and He, Lei and Xie, Lei},
  title     = {Conversational End-to-End {TTS} for Voice Agents},
  booktitle = {2021 IEEE Spoken Language Technology Workshop, SLT 2021 - Proceedings},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {403--409},
  year      = {2021},
  month     = jan,
  day       = {19},
  doi       = {10.1109/SLT48900.2021.9383460},
  language  = {English},
  keywords  = {Conversational TTS, End-to-End, Speech Corpus, Text-to-Speech, Voice Agent},
  abstract  = {End-to-end neural TTS has achieved excellent performance on reading style speech synthesis. However, it is still a challenge to build a high-quality conversational TTS due to the limitations of corpus and modeling capability. This study aims at building a conversational TTS for a voice agent under sequence to sequence modeling framework. We firstly construct a spontaneous conversational speech corpus well designed for the voice agent with a new recording scheme ensuring both recording quality and conversational speaking style. Secondly, we propose a conversation context-aware end-to-end TTS approach that employs an auxiliary encoder and a conversational context encoder to specifically reinforce the information about the current utterance and its context in a conversation as well. Experimental results show that the proposed approach produces more natural prosody in accordance with the conversational context, with significant preference gains at both utterance-level and conversation-level. Moreover, we find that the model has the ability to express some spontaneous behaviors like fillers and repeated words, which makes the conversational speaking style more realistic.},
  note      = {Conference date: 19--22 January 2021},
}