
% Journal article: Yahaya et al., Accident Analysis and Prevention, vol. 151.
% The citation key "ref1" is kept unchanged so existing citations keep resolving.
% This venue identifies papers by article number, so "pages" holds that number
% (it matches the DOI suffix) rather than a page range.
% NOTE(review): year is kept as exported (2020); volume 151 may carry a 2021
% cover date - confirm against the publisher record before relying on it.
@article{ref1,
  author   = {Yahaya, Mahama and Guo, Runhua and Jiang, Xinguo and Bashir, Kamal and Matara, Caroline and Xu, Shiwei},
  title    = {Ensemble-based model selection for imbalanced data to investigate the contributing factors to multiple fatality road crashes in {Ghana}},
  journal  = {Accident Analysis and Prevention},
  year     = {2020},
  volume   = {151},
  pages    = {105851},
  language = {en},
  issn     = {0001-4575},
  doi      = {10.1016/j.aap.2020.105851},
  url      = {https://doi.org/10.1016/j.aap.2020.105851},
  abstract = {The study aims to identify relevant variables to improve the prediction performance of the crash injury severity (CIS) classification model. Unfortunately, the CIS database is invariably characterized by the class imbalance. For instance, the samples of multiple fatal injury (MFI) severity class are typically rare as opposed to other classes. The imbalance phenomenon may introduce a prediction bias in favour of the majority class and affect the quality of the learning algorithm. The paper proposes an ensemble-based variable ranking scheme that incorporates the data resampling. At the data pre-processing level, majority weighted minority oversampling (MWMOTE) is employed to treat the imbalanced training data. Ensemble of classifiers induced from the balanced data is used to evaluate and rank the individual variables according to their importance to the injury severity prediction. The relevant variables selected are then applied to the balanced data to form a training set for the CIS classification modelling. An empirical comparison is conducted through considering the variable ranking by: 1) the learning of single inductive algorithm with imbalanced data where the relevant variables are applied to the imbalanced data to form the training data; 2) the learning of single inductive algorithm with MWMOTE data and the relevant variables identified are applied to the balanced data to form the training data; and 3) the learning of ensembles with imbalanced data where the relevant variables identified are applied to the imbalanced data to form the training data. Bayesian Networks (BNs) classifiers are then developed for each ranking method, where nested subsets of the top ranked variables are adopted. The model predictions are captured in four performance indicators in the comparative study. Based on three-year (2014-2016) crash data in Ghana, the empirical results show that the proposed method is effective to identify the most prolific predictors of the CIS level. Finally, based on the inference results of BNs developed on the best subset, the study offers the most probable explanations to the occurrence of MFI crashes in Ghana.},
}