% Conference paper in the ISCSLP 2022 proceedings (IEEE), pp. 66--70.
% The citation key is the exporter-generated identifier and is kept unchanged
% so that existing \cite commands continue to resolve.
% `eventdate` (biblatex) records the conference dates; `copyright` is not used
% by the standard styles and only preserves the exporter's copyright metadata.
@inproceedings{a03bdafe8fcd43299ea028cdf8d90cdc,
  author    = {Xie, Qicong and Li, Tao and Wang, Xinsheng and Wang, Zhichao and Xie, Lei and Yu, Guoqiao and Wan, Guanglu},
  title     = {Multi-speaker Multi-style Text-to-speech Synthesis with Single-speaker Single-style Training Data Scenarios},
  booktitle = {2022 13th International Symposium on Chinese Spoken Language Processing, {ISCSLP} 2022},
  editor    = {Lee, Kong Aik and Lee, Hung-yi and Lu, Yanfeng and Dong, Minghui},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {66--70},
  year      = {2022},
  month     = dec,
  eventdate = {2022-12-11/2022-12-14},
  doi       = {10.1109/ISCSLP57327.2022.10038056},
  language  = {english},
  keywords  = {multi-speaker, multi-style, speech synthesis},
  abstract  = {In the existing cross-speaker style transfer task, a source speaker with multi-style recordings is necessary to provide the style for a target speaker. However, it is hard for one speaker to express all expected styles. In this paper, a more general task, which is to produce expressive speech by combining any styles and timbres from a multi-speaker corpus in which each speaker has a unique style, is proposed. To realize this task, a novel method is proposed. This method is a Tacotron2-based framework but with a fine-grained text-based prosody predicting module and a speaker identity controller. Experiments demonstrate that the proposed method can successfully express a style of one speaker with the timber of another speaker bypassing the dependency on a single speaker's multi-style corpus. Moreover, the explicit prosody features used in the prosody predicting module can increase the diversity of synthetic speech by adjusting the value of prosody features.},
  copyright = {2022 IEEE},
}