% Conference paper: TEA-PSE 2.0, published in the IEEE SLT 2022 proceedings.
% The citation key is kept exactly as exported so existing \cite commands resolve.
% eventtitle, eventdate and copyright carry the details that the export had
% packed into a single note field; classic BibTeX styles ignore these fields,
% biblatex understands eventtitle/eventdate. The event dates were given as
% 09-01-2023 through 12-01-2023 and are read here as day-month-year.
@inproceedings{dd6bfb4d5c8844eb96aa5be40847f14f,
  author     = {Ju, Yukai and Zhang, Shimin and Rao, Wei and Wang, Yannan and Yu, Tao and Xie, Lei and Shang, Shidong},
  title      = {{TEA-PSE} 2.0: Sub-Band Network for Real-Time Personalized Speech Enhancement},
  booktitle  = {2022 {IEEE} Spoken Language Technology Workshop, {SLT} 2022 - Proceedings},
  pages      = {472--479},
  year       = {2023},
  publisher  = {Institute of Electrical and Electronics Engineers Inc.},
  doi        = {10.1109/SLT54892.2023.10023174},
  language   = {English},
  keywords   = {deep learning, personalized speech enhancement, real-time, sub-band},
  abstract   = {Personalized speech enhancement (PSE) utilizes additional cues like speaker embeddings to remove background noise and interfering speech and extract the speech from target speaker. Previous work, the Tencent-Ethereal-Audio-Lab personalized speech enhancement (TEA-PSE) system, ranked 1st in the ICASSP 2022 deep noise suppression (DNS2022) challenge. In this paper, we expand TEA-PSE to its sub-band version - TEA-PSE 2.0, to reduce computational complexity as well as further improve performance. Specifically, we adopt finite impulse response filter banks and spectrum splitting to reduce computational complexity. We introduce a time frequency convolution module (TFCM) to the system for increasing the receptive field with small convolution kernels. Besides, we explore several training strategies to optimize the two-stage network and investigate various loss functions in the PSE task. TEA-PSE 2.0 significantly outperforms TEA-PSE in both speech enhancement performance and computation complexity. Experimental results on the DNS2022 blind test set show that TEA-PSE 2.0 brings 0.102 OVRL personalized DNSMOS improvement with only 21.9\% multiply-accumulate operations compared with the previous TEA-PSE.},
  eventtitle = {2022 {IEEE} Spoken Language Technology Workshop, {SLT} 2022},
  eventdate  = {2023-01-09/2023-01-12},
  copyright  = {Publisher Copyright: {\textcopyright} 2023 IEEE.},
}