% INTERSPEECH 2010 conference paper. The exported "series" field was a verbatim
% copy of "booktitle" and has been dropped so styles do not print the proceedings
% title twice; "language" was normalised from a localized export value to English.
@inproceedings{b167a7859ff84bef82c8b59141bf266f,
  author    = {Liu, Zihan and Xie, Lei and Feng, Wei},
  title     = {Maximum lexical cohesion for fine-grained news story segmentation},
  booktitle = {Proceedings of the 11th Annual Conference of the International Speech Communication Association, {INTERSPEECH} 2010},
  publisher = {International Speech Communication Association},
  year      = {2010},
  pages     = {1301--1304},
  language  = {English},
  keywords  = {Dynamic programming, KL-divergence, Lexical cohesion, Spoken document retrieval, Spoken document segmentation, Story segmentation, Word weighting},
  abstract  = {We propose a maximum lexical cohesion (MLC) approach to news story segmentation. Unlike sentence-dependent lexical methods, our approach is able to detect story boundaries at finer word/subword granularity, and thus is more suitable for speech recognition transcripts which have no sentence delimiters. The proposed segmentation goodness measure takes account of both lexical cohesion and a prior preference of story length. We measure the lexical cohesion of a segment by the KL-divergence from its word distribution to an associated piecewise uniform distribution. Taking account of the uneven contributions of different words to a story, the cohesion measure is further refined by two word weighting schemes, i.e. the inverse document frequency (IDF) and a new weighting method called difference from expectation (DFE). We then propose a dynamic programming solution to exactly maximize the segmentation goodness and efficiently locate story boundaries in polynomial time. Experimental results show that our MLC approach outperforms several state-of-the-art lexical methods.},
}