@inproceedings{16ffb1783c614445b6471dead28d9bc8,
  title     = {Video Captioning with Semantic Guiding},
  abstract  = {Video captioning is to generate descriptions of videos. Most existing approaches adopt the encoder-decoder architecture, which usually use different kinds of visual features, such as temporal features and motion features, but they neglect the abundant semantic information in the video. To address this issue, we propose a framework that jointly explores visual features and semantic attributions named Semantic Guiding Long Short-Term Memory (SG-LSTM). The proposed SG-LSTM has two semantic guiding layers, both of them use three types of semantic - global semantic, object semantic and verb semantic - attributes to guide language model to use the most relevant representation to generate sentences. We evaluate our method on the public available challenging Youtube2Text dataset. Experimental results shown that our framework outperforms the state-of-the-art methods.},
  keywords  = {neural network, semantic attributes, sequence learning, Video captioning},
  author    = {Yuan, Jin and Tian, Chunna and Zhang, Xiangnan and Ding, Yuxuan and Wei, Wei},
  note      = {Publisher Copyright: {\textcopyright} 2018 IEEE.; 4th IEEE International Conference on Multimedia Big Data, BigMM 2018 ; Conference date: 13-09-2018 Through 16-09-2018},
  year      = {2018},
  month     = oct,
  day       = {18},
  doi       = {10.1109/BigMM.2018.8499357},
  language  = {English},
  series    = {2018 {IEEE} 4th International Conference on Multimedia Big Data, {BigMM} 2018},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  booktitle = {2018 {IEEE} 4th International Conference on Multimedia Big Data, {BigMM} 2018},
}