<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">An integrated intelligent surveillance system for industrial areas</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Francesco</forename><surname>Camastra</surname></persName>
							<email>francesco.camastra@uniparthenope.it</email>
							<idno type="ORCID">0000-0003-4439-7583</idno>
							<affiliation key="aff0">
								<orgName type="department">Department of Science and Technology</orgName>
								<orgName type="institution">Parthenope University of Naples</orgName>
								<address>
									<addrLine>Centro Direzionale Isola C4</addrLine>
									<postCode>80143</postCode>
									<settlement>Naples</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Angelo</forename><surname>Ciaramella</surname></persName>
							<email>angelo.ciaramella@uniparthenope.it</email>
							<idno type="ORCID">0000-0001-5592-7995</idno>
							<affiliation key="aff0">
								<orgName type="department">Department of Science and Technology</orgName>
								<orgName type="institution">Parthenope University of Naples</orgName>
								<address>
									<addrLine>Centro Direzionale Isola C4</addrLine>
									<postCode>80143</postCode>
									<settlement>Naples</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Angelo</forename><surname>Casolaro</surname></persName>
							<email>angelo.casolaro001@studenti.uniparthenope.it</email>
							<idno type="ORCID">0000-0002-7577-6765</idno>
							<affiliation key="aff0">
								<orgName type="department">Department of Science and Technology</orgName>
								<orgName type="institution">Parthenope University of Naples</orgName>
								<address>
									<addrLine>Centro Direzionale Isola C4</addrLine>
									<postCode>80143</postCode>
									<settlement>Naples</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Pasquale</forename><surname>De Trino</surname></persName>
							<email>pasquale.detrino001@studenti.uniparthenope.it</email>
							<idno type="ORCID">0009-0003-0680-4501</idno>
							<affiliation key="aff0">
								<orgName type="department">Department of Science and Technology</orgName>
								<orgName type="institution">Parthenope University of Naples</orgName>
								<address>
									<addrLine>Centro Direzionale Isola C4</addrLine>
									<postCode>80143</postCode>
									<settlement>Naples</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Alessio</forename><surname>Ferone</surname></persName>
							<email>alessio.ferone@uniparthenope.it</email>
							<idno type="ORCID">0000-0002-4883-0164</idno>
							<affiliation key="aff0">
								<orgName type="department">Department of Science and Technology</orgName>
								<orgName type="institution">Parthenope University of Naples</orgName>
								<address>
									<addrLine>Centro Direzionale Isola C4</addrLine>
									<postCode>80143</postCode>
									<settlement>Naples</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Giovanni</forename><surname>Hauber</surname></persName>
							<email>giovanni.hauber@studenti.uniparthenope.it</email>
							<idno type="ORCID">0009-0007-0137-3182</idno>
							<affiliation key="aff0">
								<orgName type="department">Department of Science and Technology</orgName>
								<orgName type="institution">Parthenope University of Naples</orgName>
								<address>
									<addrLine>Centro Direzionale Isola C4</addrLine>
									<postCode>80143</postCode>
									<settlement>Naples</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Gennaro</forename><surname>Iannuzzo</surname></persName>
							<email>gennaro.iannuzzo001@studenti.uniparthenope.it</email>
							<idno type="ORCID">0009-0003-5962-8302</idno>
							<affiliation key="aff0">
								<orgName type="department">Department of Science and Technology</orgName>
								<orgName type="institution">Parthenope University of Naples</orgName>
								<address>
									<addrLine>Centro Direzionale Isola C4</addrLine>
									<postCode>80143</postCode>
									<settlement>Naples</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Vincenzo</forename><forename type="middle">Mariano</forename><surname>Scarrica</surname></persName>
							<email>vincenzomariano.scarrica001@studenti.uniparthenope.it</email>
							<idno type="ORCID">0009-0008-4640-2693</idno>
							<affiliation key="aff0">
								<orgName type="department">Department of Science and Technology</orgName>
								<orgName type="institution">Parthenope University of Naples</orgName>
								<address>
									<addrLine>Centro Direzionale Isola C4</addrLine>
									<postCode>80143</postCode>
									<settlement>Naples</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Antonio</forename><forename type="middle">Junior</forename><surname>Spoleto</surname></persName>
							<email>antoniojunior.spoleto001@studenti.uniparthenope.it</email>
							<idno type="ORCID">0009-0007-4037-7821</idno>
							<affiliation key="aff0">
								<orgName type="department">Department of Science and Technology</orgName>
								<orgName type="institution">Parthenope University of Naples</orgName>
								<address>
									<addrLine>Centro Direzionale Isola C4</addrLine>
									<postCode>80143</postCode>
									<settlement>Naples</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Antonino</forename><surname>Staiano</surname></persName>
							<email>antonino.staiano@uniparthenope.it</email>
							<idno type="ORCID">0000-0002-4708-5860</idno>
							<affiliation key="aff0">
								<orgName type="department">Department of Science and Technology</orgName>
								<orgName type="institution">Parthenope University of Naples</orgName>
								<address>
									<addrLine>Centro Direzionale Isola C4</addrLine>
									<postCode>80143</postCode>
									<settlement>Naples</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Maria</forename><forename type="middle">Concetta</forename><surname>Vitale</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Department of Science and Technology</orgName>
								<orgName type="institution">Parthenope University of Naples</orgName>
								<address>
									<addrLine>Centro Direzionale Isola C4</addrLine>
									<postCode>80143</postCode>
									<settlement>Naples</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">An integrated intelligent surveillance system for industrial areas</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">13923930DC184E04D41E8975E84EF979</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T16:57+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Plate Detection</term>
					<term>Face Detection</term>
					<term>Fall Detection</term>
					<term>Parking Detection</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>This paper presents the design and implementation of a software prototype developed by the University of Parthenope for the SE4I project (Smart Energy Efficiency &amp; Environment for Industry), funded by the "Progetti di ricerca industriale e lo Sviluppo sperimentale" programme (PNR 2015-2020). The prototype leverages advanced computer vision techniques based on deep learning architectures to address industrial security and monitoring needs. Specifically, it tackles three key functionalities: (1) personnel and vehicle identification: the system recognizes authorized personnel and vehicle license plates within video streams captured in restricted industrial areas; (2) anomaly detection: the software detects various anomalies in video feeds, including falls of personnel in monitored zones and unattended objects left in unauthorized areas; (3) smart parking management: the prototype identifies vacant parking spaces within camera-monitored zones, enabling efficient parking management. These functionalities are integrated into the software prototype, and its performance has been thoroughly evaluated.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>The SE4I project aims to improve safety within a designated industrial area by implementing a real-time video monitoring system. This system uses strategically placed smart poles equipped with RGB cameras to capture video streams. The project focuses on three key functionalities (see Fig. <ref type="figure" target="#fig_0">1</ref>): (a) authorized access control: the system recognizes individuals and vehicle license plates, ensuring that only authorized personnel and vehicles can enter the area, typically through controlled access points with barriers. Upon arrival, an employee's car triggers the system. The camera mounted on the smart pole captures the RGB video stream of the scene, using AI to identify the license plate and the driver's face. The combined recognition of license plate and driver verifies that the vehicle and driver are authorized: if recognition is successful, access is granted; otherwise, access is denied; (b) anomaly detection: this use case focuses on detecting abnormal behavior or events in the video streams captured by the pole-mounted cameras. These anomalies range from environmental violations, such as illegal dumping of waste, to personnel safety issues, e.g., falls of personnel, and unattended objects left in restricted areas. An RGB video stream from a smart pole camera, whose scene may include both facility personnel and outsiders, continuously feeds a hardware component equipped with AI modules. The intelligent module analyzes the video to identify unusual elements. Upon detection, an alert specifying the type and location of the event is sent to a central control station, allowing immediate assistance to personnel experiencing medical emergencies or accidents and swift intervention in case of suspicious activity; (c) smart parking management: this use case addresses the detection and management of parking availability within the vast industrial area. Due to its size, automated and intelligent parking lot monitoring is crucial: the system informs users about free parking spaces as they approach designated parking areas. For learning tasks on video streams in surveillance and security applications, the state of the art is represented by computer vision techniques based on deep learning <ref type="bibr" target="#b0">[1,</ref><ref type="bibr" target="#b1">2]</ref>. In the subsequent sections, we discuss the proposed solutions for each of the aforementioned tasks.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Plate Detection</head><p>The objective is to recognize vehicles and their license plates from a surveillance video feed and retrieve the associated alphanumeric sequence. This sequence is subsequently utilized for additional vehicle recognition within the system.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Challenges</head><p>Automatic license plate recognition faces several hurdles, specifically: (a) Variable Lighting: extreme brightness, low light, and shadows can significantly reduce plate visibility; the system addresses this with techniques like adaptive thresholding and contrast enhancement; (b) Car Position: vehicles approach cameras at various angles and distances, so sophisticated algorithms are required for accurate plate localization and perspective correction; (c) Occlusions: objects like bumpers, dirt, or even other vehicles can partially or fully obscure the plate; robust object detection and diverse training data are crucial to overcome these occlusions; (d) Font Diversity: license plate formats and fonts differ significantly across countries and even regions, so training models on a wide variety of datasets is essential for generalization across different plate styles.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Methods</head><p>The proposed approach consists of three key steps: vehicle detection, license plate (LP) detection, and optical character recognition (OCR), as shown in Figure <ref type="figure" target="#fig_1">2</ref>. In the first step, the system detects vehicles in the scene using a dedicated module; to balance computation time and performance, we chose YOLOv4 <ref type="bibr" target="#b3">[4]</ref> for this module, treating the network as a closed system that consolidates the outputs related to vehicles, such as cars, buses, and motorcycles, while ignoring the other classes. Within each detected vehicle region, the Warped Planar Object Detection Network (WPOD-NET) <ref type="bibr" target="#b2">[3]</ref> searches for license plates and performs affine transformations to rectify the LP area to resemble a frontal view. The design of WPOD-NET, which is responsible for warping the license plate into a rectangular shape, was influenced by insights from YOLO, SSD <ref type="bibr" target="#b4">[5]</ref>, and Spatial Transformer Networks (STN) <ref type="bibr" target="#b5">[6]</ref>. The rectified detections are then passed to an OCR module for accurate character recognition and extraction. For OCR, we used Tesseract <ref type="bibr" target="#b6">[7]</ref>, an optical character recognition engine fine-tuned on our license plate character dataset. Tesseract's advantage over a simple CNN lies in its recurrent neural network (RNN) architecture <ref type="bibr" target="#b7">[8]</ref>, which takes into account the sequential nature of the characters on a license plate: the RNN captures the contextual dependencies between characters, enabling accurate recognition. Tesseract's extensive training on diverse datasets also makes it robust to the different font styles, sizes, and noise levels commonly found in license plate images.</p></div>
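<div xmlns="http://www.tei-c.org/ns/1.0"><p>A minimal sketch of the three-step pipeline is given below, not the project's exact implementation. It assumes YOLOv4 weights in Darknet format loaded through OpenCV's DNN module and Tesseract accessed through the pytesseract bindings; file names are hypothetical, and the WPOD-NET stage is reduced to a stand-in rectify_plate helper because its weights and inference code are external.</p><code lang="python">
import cv2
import numpy as np
import pytesseract

# COCO class ids consolidated into a single "vehicle" meta-class
# (car, motorcycle, bus, truck), mirroring the closed-system filtering above.
VEHICLE_IDS = {2, 3, 5, 7}

# Hypothetical file names for the YOLOv4 Darknet config and weights.
model = cv2.dnn_DetectionModel("yolov4.cfg", "yolov4.weights")
model.setInputParams(size=(416, 416), scale=1 / 255.0, swapRB=True)

def rectify_plate(vehicle_crop):
    """Stand-in for the WPOD-NET stage: a real implementation detects the
    plate inside the vehicle region and warps it to a frontal view. Here the
    crop is returned unchanged, so OCR quality depends on the viewpoint."""
    return vehicle_crop

def read_plates(frame):
    """Detect vehicles, rectify their plates, and OCR the characters."""
    class_ids, _scores, boxes = model.detect(frame, confThreshold=0.5,
                                             nmsThreshold=0.4)
    plates = []
    for cid, box in zip(np.asarray(class_ids).flatten(), boxes):
        if int(cid) not in VEHICLE_IDS:   # keep only the vehicle meta-class
            continue
        x, y, w, h = box
        plate_img = rectify_plate(frame[y:y + h, x:x + w])
        # Single-line page segmentation, restricted to plate alphanumerics.
        text = pytesseract.image_to_string(
            plate_img,
            config="--psm 7 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
        plates.append(text.strip())
    return plates
</code></div>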
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.">Execution</head><p>The modules are designed for real-time execution in an embedded system environment, given the strict time constraints imposed by vehicle identification. Fig. <ref type="figure" target="#fig_2">3</ref> illustrates the time required for the execution steps. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Face Recognition</head><p>Our research team has developed a framework for secure access control that treats the industrial area as a smart city environment. The framework leverages surveillance cameras positioned at the entry points of industrial areas and aims to match detected faces with the license plates of the associated vehicles. This ensures that the driver corresponds to the registered vehicle owner, improving access control security.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Challenges</head><p>The framework faces several issues, particularly with on-board processing. In addition, reflective surfaces and occlusions caused by sunlight on vehicle windshields can hinder facial recognition.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Methods</head><p>The face recognition process is divided into two main steps, namely face localization and face classification. In the localization step, faces are accurately localized within an image by the Multi-task Cascaded Convolutional Networks (MTCNN) algorithm <ref type="bibr" target="#b8">[9]</ref>. Its cascade structure consists of three stages: a Proposal Network (P-Net), a Refinement Network (R-Net), and an Output Network (O-Net). By simultaneously performing multiple tasks, such as face detection, bounding box regression, and facial landmark localization, MTCNN ensures thorough and accurate face identification. In particular, it excels at detecting faces at different scales and orientations while maintaining impressive computational efficiency, making it ideal for real-time applications.</p><p>For face classification, an extremely effective approach combines two different algorithms. The first performs face alignment with an ensemble of regression trees <ref type="bibr" target="#b9">[10]</ref>, which predicts the positions of facial landmarks directly from image data, bypassing traditional optimization methods. The second uses FaceNet <ref type="bibr" target="#b10">[11]</ref>, which efficiently maps a face into a continuous embedding space, i.e., converts it into a 128-feature embedding vector. This vector is then matched to a face in the database using a one-shot approach (see Fig. <ref type="figure" target="#fig_3">4</ref> for an example of a qualitative result on a test image).</p></div>
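<div xmlns="http://www.tei-c.org/ns/1.0"><p>As an illustration, the localization and embedding steps can be prototyped with the facenet-pytorch package, which bundles an MTCNN detector and a FaceNet-style embedder. This is an assumption of the sketch rather than the project's exact implementation: the pretrained InceptionResnetV1 produces 512-dimensional embeddings instead of the 128-dimensional vectors mentioned above, alignment is folded into the MTCNN crop rather than the regression-tree step, and the distance threshold is illustrative.</p><code lang="python">
import torch
from PIL import Image
from facenet_pytorch import MTCNN, InceptionResnetV1

mtcnn = MTCNN(image_size=160, margin=20)            # P-Net / R-Net / O-Net cascade
embedder = InceptionResnetV1(pretrained="vggface2").eval()  # FaceNet-style embedder

def embed(path):
    """Detect, align-crop, and embed the most prominent face in an image."""
    face = mtcnn(Image.open(path))      # aligned 160x160 crop, or None if no face
    if face is None:
        return None
    with torch.no_grad():
        return embedder(face.unsqueeze(0))[0]

def identify(probe_path, gallery, threshold=1.0):
    """One-shot matching: nearest gallery embedding under a distance cutoff.
    `gallery` maps person ids to precomputed embeddings; the threshold value
    is illustrative and must be calibrated on enrollment data."""
    probe = embed(probe_path)
    if probe is None:
        return None
    name, dist = min(((n, torch.dist(probe, e).item())
                      for n, e in gallery.items()),
                     key=lambda t: t[1])
    return None if dist > threshold else name
</code></div>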
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Anomaly Detection</head><p>Anomaly detection involves the identification of unusual items, events, or observations that are significantly different from the norm or expected behavior, or that indicate unusual conditions. The activities conducted in the SE4I project focused on identifying waste dumping where access is prohibited, and fall detection. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.">Challenges</head><p>Anomaly recognition in video streams presents a significant challenge: identifying rare and short-lived events that deviate from the norm. These anomalies often occur for just a few seconds, making them difficult for humans to detect and nearly impossible to capture in a single, universal model. The vast number of possible anomaly types, locations, and contexts makes defining a comprehensive model impractical. It would require an enormous amount of data and manual effort. A more effective approach is to train models that can differentiate between normal and abnormal activity, regardless of the specific anomaly type. This approach leverages the fact that normal behavior typically occurs far more frequently than anomalies.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.">Methods</head><p>We used a reconstruction-based method, in which a model is trained to learn the normal patterns of the data so that it can reconstruct them when new frames are presented. The model is designed to extract the spatiotemporal structure of the video stream in order to accurately learn the pattern of normality for a scene without anomalies <ref type="bibr" target="#b11">[12]</ref> (i.e., without the abnormal situation highlighted in red in Figure <ref type="figure" target="#fig_4">5</ref>). During testing, the model provides information about the most anomalous areas of the video by comparing input frames with reconstructed frames: a low score indicates a normal scene, while a high score indicates the presence of anomalies. The goal is to create an end-to-end model capable of learning the spatio-temporal patterns of the analyzed data and predicting when an event is anomalous with respect to the learned normality. The model used for this unsupervised analysis is called CLSTM-AE, a Long Short-Term Memory Convolutional/Transpose-Convolutional Autoencoder; the architecture is a special type of autoencoder <ref type="bibr" target="#b12">[13]</ref>. This approach enables real-time anomaly detection by identifying events that deviate significantly from the learned representation of normal video data. The trained model compares the reconstructed video frames with the original input. For normal events, the reconstructed frames closely resemble the originals, with minimal differences in pixel values. When anomalies occur, however, the network's reconstruction becomes less accurate, which is reflected in blurry or distorted frames. By analyzing this reconstruction error (the difference between original and reconstructed frames), anomalies can be identified in real time.</p></div>
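<div xmlns="http://www.tei-c.org/ns/1.0"><p>The reconstruction-and-scoring scheme can be summarized by the PyTorch sketch below. For compactness it replaces the convolutional LSTM layers of CLSTM-AE with an ordinary LSTM applied to per-frame convolutional latents, and it assumes grayscale 64x64 input clips; both are simplifying assumptions, not the project architecture.</p><code lang="python">
import torch
import torch.nn as nn

class CLSTMAE(nn.Module):
    """Compact reconstruction model: per-frame convolutional encoder,
    LSTM temporal bottleneck, transpose-convolutional decoder."""

    def __init__(self, latent=256):
        super().__init__()
        self.enc = nn.Sequential(
            nn.Conv2d(1, 32, 5, stride=2, padding=2), nn.ReLU(),
            nn.Conv2d(32, 64, 5, stride=2, padding=2), nn.ReLU(),
            nn.Flatten(), nn.Linear(64 * 16 * 16, latent))
        self.lstm = nn.LSTM(latent, latent, batch_first=True)
        self.dec = nn.Sequential(
            nn.Linear(latent, 64 * 16 * 16), nn.Unflatten(1, (64, 16, 16)),
            nn.ConvTranspose2d(64, 32, 5, stride=2, padding=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 1, 5, stride=2, padding=2, output_padding=1),
            nn.Sigmoid())

    def forward(self, clip):                  # clip: (B, T, 1, 64, 64)
        b, t = clip.shape[:2]
        z = self.enc(clip.flatten(0, 1)).view(b, t, -1)  # per-frame latents
        z, _ = self.lstm(z)                              # temporal structure
        return self.dec(z.flatten(0, 1)).view_as(clip)

def regularity_score(model, clip):
    """Per-frame reconstruction error; high values flag anomalous frames."""
    with torch.no_grad():
        return (model(clip) - clip).pow(2).mean(dim=(2, 3, 4))
</code></div>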
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3.">Fall Detection</head><p>Falls, especially outdoors where help might be delayed, can lead to serious injuries. Traditional fall detection systems often rely on wearable sensors, which can be inconvenient or impractical. The SE4I project addresses this challenge with a camera-based fall detection system using an LSTM autoencoder. The system leverages anomaly detection techniques within a computer vision framework: it essentially learns what "normal" movement looks like and identifies deviations from this norm as potential falls. This approach offers several advantages: no wearable sensors are needed, detection is purely camera-based and works with the existing surveillance infrastructure, and real-time alerts enable faster response times.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3.1.">Challenges</head><p>Traditional fall detection systems typically rely on wearable sensors or specialized depth cameras. These methods can be intrusive to users and costly to deploy on a large scale. On the other hand, relying solely on human observation through video footage is an option. However, this approach is labor-intensive and requires continuous monitoring.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3.2.">Methods</head><p>Our method addresses the previous issues by enabling fall detection with a simpler setup, a standard RGB camera, eliminating the need for specialized equipment, and with AI-powered detection that uses a single AI module running on a GPU to analyze the video stream for instances of falls. This approach eliminates the need for wearable devices and reduces the reliance on human intervention. Detection is to be performed in pedestrian areas and parks, so a dataset was created to fit this particular environment and to train the model on data representing the final context. The training dataset is representative of the normal poses that people assume while walking in places like pedestrian areas and parks. The scenes were therefore captured with a fixed camera about 3 meters above the ground, pointing across an open space and covering walking, standing, and running events in all directions, with and without obstacles/occlusions. Training a model with only "normal events" is important because abnormal events (falls) are very rare in nature and therefore expensive to acquire. The data preprocessing pipeline uses the OpenPose framework <ref type="bibr" target="#b13">[14]</ref> to extract skeleton keypoints from each frame. Irrelevant keypoints are removed from each skeleton, as they are considered noisy, and skeletons with significant missing keypoints are filtered out. Keypoint coordinates are then normalized using min/max normalization and discretized into coarser bins to provide numerical stability during training. Finally, the data are shaped into time windows of 75 skeletal frames; such windows form the basic unit on which the AI model operates. Since the video stream is assumed to be captured at 25 FPS, working with 75-frame windows means analyzing human behavior over 3-second actions. An overlap of 25 frames between consecutive windows is also included to maintain continuity between the windows. The model is based on an LSTM autoencoder <ref type="bibr" target="#b14">[15]</ref>, <ref type="bibr" target="#b15">[16]</ref>; its execution time on a consumer GPU allows for real-time performance (see Fig. <ref type="figure" target="#fig_5">6</ref>). Once the model has learned normal human behavior patterns, it can be used to reconstruct time windows. The reconstruction and the input data are then compared; if the reconstruction error exceeds a certain threshold, deviating significantly from normal data, the input is flagged as a fall event. Overall, the results highlight the effectiveness of learned temporal skeletal patterns for robust anomaly detection in the context of outdoor fall detection.</p></div>
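<div xmlns="http://www.tei-c.org/ns/1.0"><p>The windowing and scoring logic described above can be sketched as follows. The keypoint layout, bin count, and hidden size are illustrative assumptions, and the decision threshold is tuned on normal-only validation data, as in the text.</p><code lang="python">
import numpy as np
import torch
import torch.nn as nn

WIN, OVERLAP = 75, 25        # 3 s of skeletons at 25 FPS, 25-frame overlap
STRIDE = WIN - OVERLAP

def make_windows(keypoints, n_bins=32):
    """keypoints: (n_frames, n_features) array of retained OpenPose (x, y)
    coordinates. Applies min/max normalization, coarse binning, and slices
    the sequence into overlapping 75-frame windows."""
    lo, hi = keypoints.min(axis=0), keypoints.max(axis=0)
    norm = (keypoints - lo) / (hi - lo + 1e-8)
    binned = np.floor(norm * (n_bins - 1)) / (n_bins - 1)
    starts = range(0, len(binned) - WIN + 1, STRIDE)
    return np.stack([binned[s:s + WIN] for s in starts])

class SkeletonLSTMAE(nn.Module):
    """LSTM autoencoder over skeleton windows, trained on normal data only."""

    def __init__(self, n_feat, hidden=64):
        super().__init__()
        self.enc = nn.LSTM(n_feat, hidden, batch_first=True)
        self.dec = nn.LSTM(hidden, hidden, batch_first=True)
        self.out = nn.Linear(hidden, n_feat)

    def forward(self, x):                 # x: (B, 75, n_feat)
        _, (h, _) = self.enc(x)           # compress the whole window
        rep = h[-1].unsqueeze(1).repeat(1, x.size(1), 1)
        y, _ = self.dec(rep)
        return self.out(y)

def is_fall(model, window, threshold):
    """Flag a window whose reconstruction error exceeds the tuned threshold."""
    with torch.no_grad():
        err = (model(window) - window).pow(2).mean().item()
    return err > threshold
</code></div>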
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Parking Detection</head><p>Parking detection for SE4I requires the development of an automatic system that searches for free parking spaces in one of the parking areas within the industrial area and provides information to drivers who have requested a parking space. The Parking Guide and Information (PGI) system <ref type="bibr" target="#b16">[17]</ref> has been adopted as a solution for the parking detection task using a monitoring system. The proposed PGI system consists of two main parts. The former is based on a deep learning instance segmentation model that detects all available free spaces in a parking lot. The latter is a client-server architecture that automatically guides drivers to the closest parking lot with the highest number of available spaces.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.1.">Challenges</head><p>Parking lot detection systems using video surveillance face several difficulties: (a) the impact of weather, e.g., low visibility caused by fog, rain, and snow can significantly decrease the accuracy of these systems, and harsh weather conditions can obscure parking lot boundaries in the video feed; (b) diverse parking lot data, i.e., training robust parking detection models requires a large dataset with a wide variety of scenarios, including variations in parking space layouts, weather conditions, camera angles, obstructions, parking lot types (e.g., open-air, multistory), and lighting conditions (day/night); (c) real-time processing, that is, for practical applications the system needs to operate in real time, which necessitates a lightweight parking detection model that runs efficiently on the available hardware.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.2.">Methods</head><p>This work conceived a model for parking lot detection using an instance segmentation approach. Yolact++ <ref type="bibr" target="#b17">[18]</ref>, an extension of Yolact <ref type="bibr" target="#b18">[19]</ref>, was trained with successful results on a novel dataset appropriately designed for this task. The dataset consists of 1395 images and 23600 manually annotated parking lots, and it was built using a web-scraping approach. The images, taken from public-access cameras, were selected to represent a variety of conditions (weather and lighting) and features (different camera angles, occlusions, shadows, presence of people or animals, camera heights, satellite imagery in 2D and 3D, different types of lines and colors, and different backgrounds).</p><p>Parking lot detection and car detection are performed simultaneously, and each parking lot is classified as occupied or free on the basis of the IoU between the parking lot and car masks detected by the Yolact++ module. When the IoU exceeds a given threshold, the system classifies the parking lot as busy; otherwise, it is classified as free. The Yolact++ architecture is based on the RetinaNet architecture <ref type="bibr" target="#b19">[20]</ref>, using pre-trained ResNet-101 stages. In addition, Yolact++ introduces three improvements over the base model: a Fast Mask Re-Scoring Network stage, Deformable Convolutions with Intervals, and an Optimized Prediction Head. The selection of the Yolact++ architecture for the parking lot detection problem was motivated by the runtime requirements and the accuracy achieved by this instance segmentation model. A client-server system called PGI has been developed. The clients include drivers, administrators, and machine learning systems. Drivers can search for parking lots, while administrators can add, remove, and monitor parking lots. System operations are performed on the server side, which is built using PHP and MySQL for database storage. Clients connect to the server through a server interface using a Java Android app. The app provides various functionalities, such as guiding drivers to the nearest parking lot with available spaces using GPS and monitoring areas using the Google StreetView API. The system presents favorable results, with low loss values and acceptable mAP for both the box and the mask, determined using a 0.5 IoU threshold (see Table <ref type="table" target="#tab_0">1</ref> and Fig. <ref type="figure" target="#fig_6">7</ref>).</p></div>
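<div xmlns="http://www.tei-c.org/ns/1.0"><p>The occupancy rule reduces to a mask-IoU test, sketched below. Masks are assumed to be boolean arrays over the same frame, as obtained by thresholding Yolact++ instance masks; the 0.5 threshold is illustrative, since the text does not specify the value used for the occupancy decision.</p><code lang="python">
import numpy as np

def mask_iou(a, b):
    """IoU between two boolean instance masks over the same frame."""
    inter = np.logical_and(a, b).sum()
    union = np.logical_or(a, b).sum()
    return inter / union if union else 0.0

def classify_lots(lot_masks, car_masks, iou_thr=0.5):
    """Label each parking-lot mask busy or free from its overlap with the
    car masks detected in the same frame."""
    status = []
    for lot in lot_masks:
        busy = any(mask_iou(lot, car) > iou_thr for car in car_masks)
        status.append("busy" if busy else "free")
    return status
</code></div>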
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">Integration and Infrastructure</head><p>The five intelligent modules of the SE4I project (plate detection, face detection, anomaly detection, fall detection, and parking detection) are part of a larger system powered by a peer-to-peer network of NVIDIA Jetson Xavier devices mounted on multifunctional light poles. This setup ensures efficient, real-time processing of the data collected by the surveillance cameras: the computation is performed in the field, and each device shares data and JSON output with the devices on other poles using a ZMQ publisher/subscriber pattern. In addition, a dedicated module manages the surveillance camera stream and the associated metadata, such as brightness, frame rate, and contrast. All modules are containerized as Docker solutions, allowing for flexible portability, easy installation, resilience, and scalable performance. The whole system is based on the Python and C++ programming languages and uses the PyTorch, OpenCV, OpenPose, and ONVIF libraries. This infrastructure guarantees the real-time requirements and the privacy of the video-monitored areas.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: Project tasks: (a) people/vehicles recognition; (b) fall detection; (c) anomaly detection; (d) parking lot handling.</figDesc><graphic coords="2,89.29,84.19,208.34,99.41" type="bitmap" /></figure>
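<div xmlns="http://www.tei-c.org/ns/1.0"><p>A minimal pyzmq sketch of the inter-pole ZMQ publisher/subscriber exchange is shown below; the port number, hostname, topic name, and JSON payload schema are chosen purely for illustration.</p><code lang="python">
import json
import zmq

ctx = zmq.Context()

# Publisher side: a Jetson device on a pole broadcasts its module's JSON output.
pub = ctx.socket(zmq.PUB)
pub.bind("tcp://*:5556")                   # port chosen for illustration
event = {"module": "fall_detection", "pole_id": 7, "anomaly": True}
pub.send_multipart([b"events", json.dumps(event).encode()])

# Subscriber side: a peer device (or the control station) filters by topic.
# In practice the subscriber must be connected before messages are published,
# since PUB sockets drop messages sent while no subscriber is attached.
sub = ctx.socket(zmq.SUB)
sub.connect("tcp://pole-07.local:5556")    # hostname chosen for illustration
sub.setsockopt(zmq.SUBSCRIBE, b"events")
topic, payload = sub.recv_multipart()
print(json.loads(payload))
</code></div>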
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Figure 2 :</head><label>2</label><figDesc>Figure 2: The proposed pipeline at work.</figDesc><graphic coords="2,302.62,215.64,208.34,71.60" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Figure 3 :</head><label>3</label><figDesc>Figure 3: Execution time of the pipeline.</figDesc><graphic coords="2,319.94,567.61,166.67,51.69" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 4 :</head><label>4</label><figDesc>Figure 4: Visual results on a customized test.</figDesc><graphic coords="3,340.77,84.19,125.01,117.47" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_4"><head>Figure 5 :</head><label>5</label><figDesc>Figure 5: Anomaly detection in a parking lot.</figDesc><graphic coords="4,89.29,84.19,208.35,148.52" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_5"><head>Figure 6 :</head><label>6</label><figDesc>Figure 6: Pipeline components execution times.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_6"><head>Figure 7 :</head><label>7</label><figDesc>Figure 7: Visual results on test images. Masks are applied to the lots, each with a different color to better distinguish each instance. The associated probability score is printed on each mask.</figDesc><graphic coords="6,106.61,84.19,166.67,162.91" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1</head><label>1</label><figDesc>Results on Yolact++ after fine-tuning and testing on the custom dataset.</figDesc><table><row><cell>Metrics</cell><cell>Average Values</cell></row></table></figure>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>This research was conducted as part of the Smart Energy Efficiency &amp; Environment for Industry (SE4I) project, CUP I66G18000230005, funded by "Progetti di ricerca industriale e lo Sviluppo sperimentale nelle 12 aree di specializzazione individuate nel PNR 2015-2020, di cui al D.D. del 13 luglio 2017 n. 1735".</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Adaptive quick reduct for feature drift detection</title>
		<author>
			<persName><forename type="first">A</forename><surname>Ferone</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Maratea</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Algorithms</title>
		<imprint>
			<biblScope unit="volume">14</biblScope>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">Deep neural networks and explainable machine learning</title>
		<author>
			<persName><forename type="first">A</forename><surname>Maratea</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ferone</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">WILF 2018</title>
				<imprint>
			<date type="published" when="2019">2019</date>
			<biblScope unit="volume">11291</biblScope>
			<biblScope unit="page" from="253" to="256" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">License plate detection and recognition in unconstrained scenarios</title>
		<author>
			<persName><forename type="first">S</forename><surname>Montazzolli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Jung</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">ECCV 2018</title>
				<imprint>
			<publisher>Springer Intl. Pub</publisher>
			<date type="published" when="2018">2018</date>
			<biblScope unit="page" from="593" to="609" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<monogr>
		<title level="m" type="main">Yolov4: Optimal speed and accuracy of object detection</title>
		<author>
			<persName><forename type="first">A</forename><surname>Bochkovskiy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><forename type="middle">M</forename><surname>Liao</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2004.10934</idno>
		<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Ssd: Single shot multibox detector</title>
		<author>
			<persName><forename type="first">W</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Anguelov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Erhan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Szegedy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Reed</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C.-Y</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">C</forename><surname>Berg</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">ECCV 2016</title>
				<imprint>
			<publisher>Springer Intl. Pub</publisher>
			<date type="published" when="2016">2016</date>
			<biblScope unit="page" from="21" to="37" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<monogr>
		<author>
			<persName><forename type="first">M</forename><surname>Jaderberg</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Simonyan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Zisserman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Kavukcuoglu</surname></persName>
		</author>
		<idno>CoRR abs/1506.02025</idno>
		<title level="m">Spatial transformer networks</title>
				<imprint>
			<date type="published" when="2015">2015</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">An overview of the tesseract ocr engine</title>
		<author>
			<persName><forename type="first">R</forename><surname>Smith</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">ICDAR</title>
		<imprint>
			<biblScope unit="volume">2</biblScope>
			<biblScope unit="page" from="629" to="633" />
			<date type="published" when="2007">2007. 2007</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">Long short-term memory</title>
		<author>
			<persName><forename type="first">S</forename><surname>Hochreiter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Schmidhuber</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Neural computation</title>
		<imprint>
			<biblScope unit="volume">9</biblScope>
			<biblScope unit="page" from="1735" to="1780" />
			<date type="published" when="1997">1997</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Joint face detection and alignment using multitask cascaded convolutional networks</title>
		<author>
			<persName><forename type="first">K</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Qiao</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Signal Processing Letters</title>
		<imprint>
			<biblScope unit="volume">23</biblScope>
			<biblScope unit="page" from="1499" to="1503" />
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">One millisecond face alignment with an ensemble of regression trees</title>
		<author>
			<persName><forename type="first">V</forename><surname>Kazemi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Sullivan</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">IEEE Conference on Computer Vision and Pattern Recognition</title>
				<imprint>
			<date type="published" when="2014">2014. 2014</date>
			<biblScope unit="page" from="1867" to="1874" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">Facenet: A unified embedding for face recognition and clustering</title>
		<author>
			<persName><forename type="first">F</forename><surname>Schroff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Kalenichenko</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Philbin</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</title>
				<imprint>
			<date type="published" when="2015">2015. 2015</date>
			<biblScope unit="page" from="815" to="823" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<monogr>
		<title level="m" type="main">Learning temporal regularity in video sequences</title>
		<author>
			<persName><forename type="first">M</forename><surname>Hasan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Choi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Neumann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">K</forename><surname>Roy-Chowdhury</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">S</forename><surname>Davis</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1604.04574</idno>
		<imprint>
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<monogr>
		<author>
			<persName><forename type="first">Y</forename><forename type="middle">S</forename><surname>Chong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><forename type="middle">H</forename><surname>Tay</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1701.01546</idno>
		<title level="m">Abnormal event detection in videos using spatiotemporal autoencoder</title>
				<imprint>
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<monogr>
		<title level="m" type="main">Openpose: Realtime multi-person 2d pose estimation using part affinity fields</title>
		<author>
			<persName><forename type="first">Z</forename><surname>Cao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Hidalgo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Simon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S.-E</forename><surname>Wei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Sheikh</surname></persName>
		</author>
		<idno type="DOI">10.1109/TPAMI.2019.2929257</idno>
		<idno type="arXiv">arXiv:1812.08008</idno>
		<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">Long short-term memory</title>
		<author>
			<persName><forename type="first">S</forename><surname>Hochreiter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Schmidhuber</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Neural Comput</title>
		<imprint>
			<biblScope unit="volume">9</biblScope>
			<date type="published" when="1997">1997</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">Nonlinear principal component analysis using autoassociative neural networks</title>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">A</forename><surname>Kramer</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Aiche Journal</title>
		<imprint>
			<biblScope unit="volume">37</biblScope>
			<biblScope unit="page" from="233" to="243" />
			<date type="published" when="1991">1991</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">Real-time image-based parking occupancy detection using deep learning</title>
		<author>
			<persName><forename type="first">D</forename><surname>Acharya</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Yan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Khoshelham</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Research@ Locate</title>
		<imprint>
			<biblScope unit="volume">4</biblScope>
			<biblScope unit="page" from="33" to="40" />
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<monogr>
		<author>
			<persName><forename type="first">C</forename><surname>Zhou</surname></persName>
		</author>
		<title level="m">Yolact++ Better Real-Time Instance Segmentation</title>
				<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
		<respStmt>
			<orgName>University of California, Davis</orgName>
		</respStmt>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">Yolact: Realtime instance segmentation</title>
		<author>
			<persName><forename type="first">D</forename><surname>Bolya</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Zhou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Xiao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><forename type="middle">J</forename><surname>Lee</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE/CVF international conference on computer vision</title>
				<meeting>the IEEE/CVF international conference on computer vision</meeting>
		<imprint>
			<date type="published" when="2019">2019</date>
			<biblScope unit="page" from="9157" to="9166" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<monogr>
		<title level="m" type="main">Focal loss for dense object detection</title>
		<author>
			<persName><forename type="first">T</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Goyal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">B</forename><surname>Girshick</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>He</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Dollár</surname></persName>
		</author>
		<idno>CoRR abs/1708.02002</idno>
		<ptr target="http://arxiv.org/abs/1708.02002" />
		<imprint>
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
