@inproceedings{92c8ea8d35484bc49f762c4b750c629f,
  title     = {Deep Audio-Visual System for Closed-Set Word-Level Speech Recognition},
  abstract  = {Audio-visual understanding is usually challenged by the complementary gap between audio and visual informative bridging. Motivated by the recent audio-visual studies, a closed-set word-level speech recognition scheme is proposed for the Mandarin Audio-Visual Speech Recognition (MAVSR) Challenge in this study. To achieve respective audio and visual encoder initialization more effectively, a 3-dimensional convolutional neural network (CNN) and an attention-based bi-directional long short-term memory (Bi-LSTM) network are trained. With two fully connected layers in addition to the concatenated encoder outputs for the audio-visual joint training, the proposed scheme won the first place with a relative word accuracy improvement of 7.9% over the solitary audio system. Experiments on LRW-1000 dataset have substantially demonstrated that the proposed joint training scheme by audio-visual incorporation is capable of enhancing the recognition performance of relatively short duration samples, unveiling the multi-modal complementarity.},
  keywords  = {Audio-visual, Convolutional neural network, Long short-term memory, Multi-modal},
  author    = {Yuan, Yougen and Tang, Wei and Fan, Minhao and Cao, Yue and Zhang, Peng and Xie, Lei},
  editor    = {Gao, Wen and Meng, Helen Mei Ling and Turk, Matthew and Fussell, Susan R. and Schuller, Bj{\"o}rn and Song, Yale and Yu, Kai},
  booktitle = {ICMI 2019 - Proceedings of the 2019 International Conference on Multimodal Interaction},
  series    = {ICMI 2019 - Proceedings of the 2019 International Conference on Multimodal Interaction},
  publisher = {Association for Computing Machinery, Inc},
  pages     = {540--545},
  year      = {2019},
  month     = oct,
  day       = {14},
  doi       = {10.1145/3340555.3356102},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2019 Association for Computing Machinery.; 21st ACM International Conference on Multimodal Interaction, ICMI 2019 ; Conference date: 14-10-2019 Through 18-10-2019},
}