
@article{ref1,
  author   = {Rendón-Segador, Fernando J. and Álvarez-García, Juan A. and Salazar-González, Jose L. and Tommasi, Tatiana},
  title    = {{CrimeNet}: Neural Structured Learning Using {Vision Transformer} for Violence Detection},
  journal  = {Neural Networks},
  year     = {2023},
  volume   = {161},
  pages    = {318--329},
  issn     = {0893-6080},
  doi      = {10.1016/j.neunet.2023.01.048},
  language = {en},
  abstract = {The state of the art in violence detection in videos has improved in recent years thanks to deep learning models, but it is still below 90\% of average precision in the most complex datasets, which may pose a problem of frequent false alarms in video surveillance environments and may cause security guards to disable the artificial intelligence system. In this study, we propose a new neural network based on Vision Transformer (ViT) and Neural Structured Learning (NSL) with adversarial training. This network, called CrimeNet, outperforms previous works by a large margin and reduces practically to zero the false positives. Our tests on the four most challenging violence-related datasets (binary and multi-class) show the effectiveness of CrimeNet, improving the state of the art from 9.4 to 22.17 percentage points in ROC AUC depending on the dataset. In addition, we present a generalisation study on our model by training and testing it on different datasets. The obtained results show that CrimeNet improves over competing methods with a gain of between 12.39 and 25.22 percentage points, showing remarkable robustness.},
}