% WeNet toolkit paper (INTERSPEECH 2021 proceedings entry).
% The citation key is kept as exported so existing \cite commands keep working.
% Names are stored as "Last, First"; the acronyms WeNet and INTERSPEECH are
% brace-protected so sentence-casing styles cannot lowercase them.
@inproceedings{b4be69a0a9d6497c9247607175c496ab,
  author    = {Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
  title     = {{WeNet}: Production Oriented Streaming and Non-Streaming End-to-End Speech Recognition Toolkit},
  booktitle = {22nd Annual Conference of the International Speech Communication Association, {INTERSPEECH} 2021},
  series    = {Proceedings of the Annual Conference of the International Speech Communication Association, {INTERSPEECH}},
  publisher = {International Speech Communication Association},
  year      = {2021},
  pages     = {2093--2097},
  doi       = {10.21437/Interspeech.2021-1983},
  language  = {English},
  keywords  = {Production oriented, U2, WeNet},
  abstract  = {In this paper, we propose an open source speech recognition toolkit called WeNet, in which a new two-pass approach named U2 is implemented to unify streaming and non-streaming end-to-end (E2E) speech recognition in a single model. The main motivation of WeNet is to close the gap between the research and deployment of E2E speech recognition models. WeNet provides an efficient way to ship automatic speech recognition (ASR) applications in real-world scenarios, which is the main difference and advantage to other open source E2E speech recognition toolkits. We develop a hybrid connectionist temporal classification (CTC)/attention architecture with transformer or conformer as encoder and an attention decoder to rescore the CTC hypotheses. To achieve streaming and non-streaming in a unified model, we use a dynamic chunk-based attention strategy which allows the self-attention to focus on the right context with random length. Our experiments on the AISHELL-1 dataset show that our model achieves 5.03\% relative character error rate (CER) reduction in non-streaming ASR compared to a standard non-streaming transformer. After model quantification, our model achieves reasonable RTF and latency at runtime. The toolkit is publicly available at https://github.com/mobvoi/wenet.},
  note      = {Publisher Copyright: {\textcopyright} 2021 ISCA. 22nd Annual Conference of the International Speech Communication Association, {INTERSPEECH} 2021. Conference date: 30-08-2021 through 03-09-2021.},
}