@inproceedings{3e0792dcfd344459a8aa34ae0a62fe19,
title = "Sa-Paraformer: Non-Autoregressive End-To-End Speaker-Attributed ASR",
abstract = "Joint modeling of multi-speaker ASR and speaker diarization has recently shown promising results in speaker-attributed automatic speech recognition (SA-ASR). Although being able to obtain state-of-the-art (SOTA) performance, most of the studies are based on an autoregressive (AR) decoder which generates tokens one-by-one and results in a large real-time factor (RTF). To speed up inference, we introduce a recently proposed non-autoregressive model Paraformer as an acoustic model in the SA-ASR model. Paraformer uses a single-step decoder to enable parallel generation, obtaining comparable performance to the SOTA AR transformer models. Besides, we propose a speaker-filling strategy to reduce speaker identification errors and adopt an inter-CTC strategy to enhance the encoder's ability in acoustic modeling. Experiments on the AliMeeting corpus show that our model outperforms the cascaded SA-ASR model by a 6.1% relative speaker-dependent character error rate (SD-CER) reduction on the test set. Moreover, our model achieves a comparable SD-CER of 34.8% with only 1/10 RTF compared with the SOTA joint AR SA-ASR model.",
keywords = "AliMeeting, multi-speaker ASR, non-autoregressive, Speaker-attributed ASR",
author = "Yangze Li and Fan Yu and Yuhao Liang and Pengcheng Guo and Mohan Shi and Zhihao Du and Shiliang Zhang and Lei Xie",
note = "Publisher Copyright: {\textcopyright} 2023 IEEE.; 2023 IEEE Automatic Speech Recognition and Understanding Workshop, ASRU 2023 ; Conference date: 16-12-2023 Through 20-12-2023",
year = "2023",
doi = "10.1109/ASRU57964.2023.10389762",
language = "英语",
series = "2023 IEEE Automatic Speech Recognition and Understanding Workshop, ASRU 2023",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
booktitle = "2023 IEEE Automatic Speech Recognition and Understanding Workshop, ASRU 2023",
}