@inproceedings{11c5f5aea2bf46cc8404eb1c09fdd9b3,
title = "Controlling Emotion Strength with Relative Attribute for End-To-End Speech Synthesis",
abstract = "Recently, attention-based end-To-end speech synthesis has achieved superior performance compared to traditional speech synthesis models, and several approaches like global style tokens are proposed to explore the style controllability of the end-To-end model. Although the existing methods show good performance in style disentanglement and transfer, it is still unable to control the explicit emotion of generated speech. In this paper, we mainly focus on the subtle control of expressive speech synthesis, where the emotion category and strength can be easily controlled with a discrete emotional vector and a continuous simple scalar, respectively. The continuous strength controller is learned by a ranking function according to the relative attribute measured on an emotion dataset. Our method automatically learns the relationship between low-level acoustic features and high-level subtle emotion strength. Experiments show that our method can effectively improve the controllability for an expressive end-To-end model.",
keywords = "Emotion strength, end-To-end, relative attributes, speech synthesis, text-To-speech",
author = "Xiaolian Zhu and Shan Yang and Geng Yang and Lei Xie",
note = "Publisher Copyright: {\textcopyright} 2019 IEEE.; 2019 IEEE Automatic Speech Recognition and Understanding Workshop, ASRU 2019 ; Conference date: 15-12-2019 Through 18-12-2019",
year = "2019",
month = dec,
doi = "10.1109/ASRU46091.2019.9003829",
language = "英语",
series = "2019 IEEE Automatic Speech Recognition and Understanding Workshop, ASRU 2019 - Proceedings",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "192--199",
booktitle = "2019 IEEE Automatic Speech Recognition and Understanding Workshop, ASRU 2019 - Proceedings",
}