% KALL-E conference paper (AAAI 2026 proceedings).
% Braces around KALL-E in the title keep its capitalisation under sentence-casing styles.
@inproceedings{c0ef3e1e5e114c358d587d50ab66d198,
title = "{KALL-E}: Autoregressive Speech Synthesis with Next-Distribution Prediction",
abstract = "We introduce KALL-E, a novel autoregressive (AR) language model for text-to-speech (TTS) synthesis that operates by predicting the next distribution of continuous speech frames. Unlike existing methods, KALL-E directly models the continuous speech distribution conditioned on text, eliminating the need for any diffusion-based components. Specifically, we utilize a Flow-VAE to extract a continuous latent speech representation from waveforms, instead of relying on discrete speech tokens. A single AR Transformer is then trained to predict these continuous speech distributions from text, optimizing a Kullback--Leibler divergence loss as its objective. Experimental results demonstrate that KALL-E achieves superior speech synthesis quality and can even adapt to a target speaker from just a single sample. Importantly, KALL-E provides a more direct and effective approach for utilizing continuous speech representations in TTS.",
author = "Kangxiang Xia and Xinfa Zhu and Jixun Yao and Wenjie Tian and Wenhao Li and Lei Xie",
note = "Publisher Copyright: {\textcopyright} 2026, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved.; 40th AAAI Conference on Artificial Intelligence, AAAI 2026 ; Conference date: 20-01-2026 Through 27-01-2026",
year = "2026",
doi = "10.1609/aaai.v40i40.40695",
language = "English",
isbn = "9781577359067",
series = "Proceedings of the AAAI Conference on Artificial Intelligence",
publisher = "Association for the Advancement of Artificial Intelligence",
number = "40",
pages = "34016--34024",
editor = "Koenig, Sven and Jenkins, Chad and Taylor, Matthew E.",
booktitle = "Proceedings of the AAAI Conference on Artificial Intelligence",
edition = "40",
}