% Conference paper (ICMI 2021 Companion). The citation key is the exporter's
% opaque identifier and is kept unchanged so existing citations still resolve.
% The exported "series" field was dropped: it repeated the booktitle verbatim,
% which makes styles that print both fields show the proceedings title twice.
@inproceedings{0af5fec30350465797d9dfcd3341c82e,
  author    = {Chen, Yi and Yang, Shan and Hu, Na and Xie, Lei and Su, Dan},
  title     = {{TeNC}: Low Bit-Rate Speech Coding with {VQ-VAE} and {GAN}},
  booktitle = {{ICMI} 2021 Companion - Companion Publication of the 2021 International Conference on Multimodal Interaction},
  pages     = {126--130},
  publisher = {Association for Computing Machinery, Inc},
  year      = {2021},
  month     = oct,
  day       = {18},
  doi       = {10.1145/3461615.3491114},
  language  = {English},
  keywords  = {Codec, GAN, VQ-VAE, low bit-rate, neural speech coding},
  abstract  = {Speech coding aims at compressing digital speech signals with fewer bits and reconstructing it back to raw signals, maintaining the speech quality as much as possible. But conventional codecs usually need a high bit-rate to achieve reconstructed speech with reasonable high quality. In this paper, we propose an end-to-end neural generative codec with a VQ-VAE based auto-encoder and the generative adversarial network (GAN), which achieves reconstructed speech with high-fidelity at a low bit-rate about 2 kb/s. The compression process of speech coding is carried out by a down-sampling module of the encoder and a learnable discrete codebook. GAN is used to further improve the reconstructed quality. Our experiments confirm the effectiveness of the proposed model in both objective and subjective tests, which significantly outperforms the conventional codecs at low bit-rate in terms of speech quality and speaker similarity.},
  note      = {Publisher Copyright: {\textcopyright} 2021 ACM. 23rd ACM International Conference on Multimodal Interaction, ICMI 2021. Conference date: 18-10-2021 through 22-10-2021},
}