% Conference paper in the PDCAT 2019 proceedings (IEEE), pages 247--252.
% Names use the "Last, First" form; the acronym in the title is braced so
% sentence-casing styles keep it upper-case. The proceedings title is stored
% in booktitle only (no series field, since it would merely repeat booktitle).
@inproceedings{3328c367308f4bd98ecf0ac0f8eab3c1,
  author    = {Hou, Zhengxiong and Zhao, Shuxin and Yin, Chao and Wang, Yunlan and Gu, Jianhua and Zhou, Xingshe},
  title     = {Machine learning based performance analysis and prediction of jobs on a {HPC} cluster},
  booktitle = {Proceedings - 2019 20th International Conference on Parallel and Distributed Computing, Applications and Technologies, PDCAT 2019},
  editor    = {Tian, Hui and Shen, Hong and Tan, {Wee Lum}},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {247--252},
  year      = {2019},
  month     = dec,
  doi       = {10.1109/PDCAT46702.2019.00053},
  language  = {English},
  keywords  = {HPC cluster, Job log, Machine learning, Performance analysis, Performance prediction},
  abstract  = {There are a lot of middle-class or small-class high-performance computing clusters at universities and research institutes, etc. Large volumes of job logs have been accumulated after many years of operation. In this paper, on the basis of accumulated job logs on a high-performance computing cluster, we examine and analyze the job logs. Then, we study machine learning based performance analysis and prediction methods for parallel jobs. Various machine learning methods such as multivariate linear fitting, artificial neural network are used to build performance prediction models. We compare the errors of each model, and select the optimal prediction model for different users. The experimental results show that we can obtain reasonable prediction accuracy using the selected machine learning algorithms.},
  note      = {Publisher Copyright: {\textcopyright} 2019 IEEE.; 20th International Conference on Parallel and Distributed Computing, Applications and Technologies, PDCAT 2019 ; Conference date: 05-12-2019 Through 07-12-2019},
}