<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">QualAI: Continuous Quality Improvement of AI-based Systems</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Nicole</forename><surname>Novielli</surname></persName>
							<email>nicole.novielli@uniba.it</email>
							<affiliation key="aff0">
								<orgName type="institution">University of Bari</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Rocco</forename><surname>Oliveto</surname></persName>
							<email>rocco.oliveto@unimol.it</email>
							<affiliation key="aff1">
								<orgName type="institution">University of Molise</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Fabio</forename><surname>Palomba</surname></persName>
							<email>fpalomba@unisa.it</email>
							<affiliation key="aff2">
								<orgName type="institution">University of Salerno</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Fabio</forename><surname>Calefato</surname></persName>
							<email>fabio.calefato@uniba.it</email>
							<affiliation key="aff0">
								<orgName type="institution">University of Bari</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Giuseppe</forename><surname>Colavito</surname></persName>
							<email>giuseppe.colavito@uniba.it</email>
							<affiliation key="aff0">
								<orgName type="institution">University of Bari</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Vincenzo</forename><surname>De Martino</surname></persName>
							<email>vdemartino@unisa.it</email>
							<affiliation key="aff2">
								<orgName type="institution">University of Salerno</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Antonio</forename><surname>Della Porta</surname></persName>
							<affiliation key="aff2">
								<orgName type="institution">University of Salerno</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Giammaria</forename><surname>Giordano</surname></persName>
							<email>giagiordano@unisa.it</email>
							<affiliation key="aff2">
								<orgName type="institution">University of Salerno</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Emanuela</forename><surname>Guglielmi</surname></persName>
							<email>emanuela.guglielmi@unimol.it</email>
							<affiliation key="aff1">
								<orgName type="institution">University of Molise</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Filippo</forename><surname>Lanubile</surname></persName>
							<email>filippo.lanubile@uniba.it</email>
							<affiliation key="aff0">
								<orgName type="institution">University of Bari</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Luigi</forename><surname>Quaranta</surname></persName>
							<email>luigi.quaranta@uniba.it</email>
							<affiliation key="aff0">
								<orgName type="institution">University of Bari</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Gilberto</forename><surname>Recupito</surname></persName>
							<email>grecupito@unisa.it</email>
							<affiliation key="aff2">
								<orgName type="institution">University of Salerno</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Simone</forename><surname>Scalabrino</surname></persName>
							<email>simone.scalabrino@unimol.it</email>
							<affiliation key="aff1">
								<orgName type="institution">University of Molise</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Angelica</forename><surname>Spina</surname></persName>
							<email>a.spina5@studenti.unimol.it</email>
							<affiliation key="aff1">
								<orgName type="institution">University of Molise</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Antonio</forename><surname>Vitale</surname></persName>
							<email>a.vitale8@studenti.unimol.it</email>
							<affiliation key="aff1">
								<orgName type="institution">University of Molise</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">QualAI: Continuous Quality Improvement of AI-based Systems</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">82854F0D33243E43F592B6AA0A0F1172</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T18:49+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Software Engineering</term>
					<term>Machine Learning</term>
					<term>Quality Assurance</term>
					<term>Recommender Systems</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>QualAI is a two-year project that aims to define a set of recommenders to continuously monitor, assess, and improve the quality of AI-based systems, with a particular focus on ML-based systems. Quality assurance will be guaranteed from different perspectives and during both the development and operations phases. We will define recommenders for the quality assurance of both data and ML models to enable practitioners to mitigate technical debt. Emphasis will be given to communication issues that could arise in hybrid teams including data scientists and software developers. In this paper, we present the project outline, provide an executive summary of the research activities, and present the expected project results.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction and Motivation</head><p>In 2020, Google Health released an extremely accurate AI software for identifying diabetic retinopathy in pictures of patients' eyes. The classifier achieved over 90% accuracy and provided a diagnosis in less than 10 minutes. Unfortunately, when deployed for use in hospitals, the AI-based classifier experienced a drop in performance compared to the lab setting. Also, the system often failed to provide an outcome: being trained with high-resolution pictures, it discarded over one-fifth of images due to their low quality. This caused delays of up to months to obtain a diagnosis, resulting in complaints from patients <ref type="bibr" target="#b0">[1]</ref>. This accident shows how assessing performance in the lab might not be enough to ensure the quality of AI-based systems, as the success of a machine learning (ML) model does not consist exclusively of its accuracy. Special attention should be devoted to users' needs and context of action as well as to the integration of ML models with non-ML software as part of a large AI-based system. In a typical software system, given the requirements, the behavior is always specified by the developers. In an ML-based system, instead, data scientists define the operationalization of constructs playing a role in the addressed problem. Also, they build a training set, and identify the envisaged ML technique, which then defines the system behavior (ML models). Such systems require maintenance and quality assurance like any other system but special attention should be devoted to the typical issues affecting the quality of data and ML models <ref type="bibr" target="#b1">[2]</ref>. As such, assessing and improving the quality of ML-based systems presents unique challenges involving different aspects, which we discuss in the following. 
First, quality issues can be found in the ML models that the system uses to build it or its parameters, as well as in the data used for training them. For example, the historical data once used to train the ML model cannot be used blindly because they may become outdated and no longer reflect the status quo, due to a concept drift that might be occurring. Second, communication issues might arise as the teams working on ML-based systems are intrinsically heterogeneous. Several peculiar quality issues may arise, related, for example, to team communication and technological gaps, e.g., data scientists and software developers may use incompatible technologies. Finally, further issues might arise at the level of deployment and operations. The automated build process of some modules of an ML-based system and the construction of container images often require training one or more ML models. Specific quality issues may occur in this phase.</p><p>In essence, developers and data scientists are now confronted with the challenge of being more agile and adaptive. More specifically, new methods and strategies are needed for keeping ML-based systems responsive, monitored, and dependent on reliable variables. MLOps<ref type="foot" target="#foot_0">1</ref> is an ML engineering culture and practice that aims at dealing with the above challenges. MLOps unifies the ML system development (Dev) and ML system operation (Ops) advocating for automation and monitoring at all steps of ML system construction, including integration, testing, releasing, deployment, and infrastructure management, thus representing an umbrella for best practices and guiding principles around machine learning.</p><p>The above considerations motivate this project proposal. QualAI aims to define a set of recommenders that can be used to continuously monitor, assess, and improve the quality of AI-based systems, with a particular focus on ML-based systems. 
Quality assurance will be guaranteed from different perspectives and during both the development and operations phases. We will define recommenders for the quality assurance of both data and ML models. Results will allow practitioners to mitigate technical debt <ref type="bibr" target="#b1">[2]</ref>. Emphasis will be given to communication issues that could arise between data scientists and software developers. Finally, we will define approaches to (i) identify quality issues in the CI/CD pipeline; and (ii) monitor the quality of the system during the operations phase. QualAI will facilitate both the analysis of the recommendations (thanks to their explainability) and the planning of the corrective operations suggested by QualAI (thanks to the cost-effective analysis). A web platform integrating the recommenders for assessing the quality of ML-based systems will be released to produce quality badges summarizing the quality of a given AI-based system.</p><p>QualAI is a two-year project that was funded in July 2023 by the European Union - NextGenerationEU through the PRIN 2022 call for projects of the Italian Ministry of University and Research. In the following, we provide a description of the research goals and an executive summary, by also positioning this research in the frame of related work.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Goals and Expected Results</head><p>Project Goal and Final Outcome The goal of QualAI is to define a set of recommenders to improve the quality of AI-based systems, in general, and of ML-systems, in particular, from different perspectives: data and ML models, ML integration, and deployment and operations. This goal will be achieved by monitoring the quality of the ML-based system during its whole life-cycle aiming at collecting useful information to automatically assess its level of quality. Once a quality issue, i.e., technical debt, has been identified, corrective operations will be suggested to remove the technical debt and improve the overall quality of the ML-based system.</p><p>All the QualAI recommendations will have a cost-effective and explainable connotation. They will be designed to rank the identified issues or the identified corrective operations based on the ratio between the potential costs that developers should spend to address the issue, e.g., change the ML model, and the potential benefits that their removal might provide to the overall quality of the ML-based system. Also, each recommendation is enriched with a human-readable explanation, in textual or visual form, see for instance Bellini et al. <ref type="bibr" target="#b2">[3]</ref>, providing the rationale behind the identified issues or the identified corrective actions. Such properties will increase the practitioners' confidence in the recommendations received and help them make informed decisions. The QualAI recommenders can be properly designed to be easily integrated into a Continuous Integration/Continuous Deployment (CI/CD) pipeline aiming at continuously improving the quality of ML-based systems.</p><p>The final outcome of the QualAI project is represented by a set of recommenders able to assess and improve the quality of ML-based systems. Figure <ref type="figure" target="#fig_0">1</ref> shows the overall workflow of QualAI. 
The recommenders composing QualAI are activated when a developer or data scientist commits a change to the ML-based system. Then, QualAI analyzes the quality of the new version of the ML-based system from different perspectives: data and ML models, ML integration, and deployment and operations. The pipeline of QualAI recommenders can be easily integrated into the original CI/CD pipeline of the ML-based system to allow continuous quality assurance.</p><p>In this respect, QualAI also provides specific recommenders to optimize and improve the CI/CD pipeline. The quality assessment is performed by the Quality Assessment component of QualAI.</p><p>At the end of the analysis, the QualAI Quality Assessment component provides as output a set of quality badges (one for each quality dimension analyzed) that summarize the quality of the system and emphasize specific quality issues. These badges could be used by a project manager to simply analyze the quality of the system or as a support to certify that the ML system has a certain level of quality. The QualAI Quality Assessment component also provides information (e.g., the quality problem identified and its location) to the Quality Improvement component of QualAI. The Quality Improvement component is in charge of identifying corrective actions (i.e., refactoring operations) aiming at removing the identified issues and thus improving the overall quality of the system. Each recommendation is accompanied by a description in a human-comprehensible format as well as an analysis of the cost-benefits for each proposed operation. Such analyses will facilitate the planning and the schedule of the proposed operations (e.g., the software analyst could decide to focus the attention on the most critical issues and postpone the others).</p><p>Both the Quality Assessment and Quality Improvement components rely on the monitoring framework of QualAI, i.e., the shared knowledge base which all the recommenders are based on. 
Such a knowledge base is continuously and automatically updated and contains resources internal (e.g., source code, ML models, training data, issues, logs, mailing lists, user reviews) to the ML system under analysis and external to the system (e.g., source code and related artifacts of other ML-based software projects, question and answer sites).</p><p>The accuracy of the QualAI recommenders will be empirically evaluated. We plan to conduct mixed-method research that combines (1) the mining of data science projects, which aims at establishing the accuracy of the recommenders; and (2) survey- and interview-based studies with developers to get feedback on the effectiveness of the proposed recommenders. In the context of the study, we will define guidelines for conducting such empirical studies and for creating and sharing replication packages. We also plan to apply the QualAI recommenders on a set of industrial software systems and involve practitioners by exploiting the collaboration with our industrial partners.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Objectives and Expected Results</head><p>To achieve the overall goal of our research project, we will address the following objectives.</p><p>• OB1: Definition of a monitoring framework for knowledge management. As a shared preliminary objective, we will define what data sources should be considered and what formats should be used to represent the data. More specifically, in this project we will analyze developers' communication (e.g., on collaboration platforms), user feedback (e.g., through application reviews), source code and notebooks, and build logs through continuous integration tools, application logs from monitoring tools. The commonly used ML process models will be reviewed and synthesized as a preliminary step, to ensure that the approaches defined as an outcome of the other objectives provide adequate support for most of the realistic application scenarios of ML-based systems. The expected result of this activity is a common framework that will be used in all the following phases. • OB2: Definition of approaches for assessing and improving the quality of data and ML models. We will consider the causes leading to the degradation of several properties of ML systems, including robustness, efficiency, privacy, interpretability, fairness, and reproducibility. As a result, we plan to build a comprehensive catalog of the issues affecting the above-mentioned properties as well as the mitigation strategies that can improve them. To this aim, we will define novel approaches to identify issues in the data used for training the models, in the machine learning techniques used to build them, and in their configuration, based on, both, static and dynamic analysis. In this respect, we plan to propose recommenders that balance the cost needed to address the issues identified and the associated effectiveness. 
Also, all the recommendations will be explainable, in an effort to facilitate the identification of more critical issues to address.</p><p>The second expected result is a set of cost-effective recommendation techniques that can automatically improve data and model quality. • OB3: Definition of approaches for assessing and improving the quality of the integration between the underlying ML models and the rest of the system. We will focus on several relevant aspects, including team communication, technical gap, and system security. The first expected result is a set of cost-effective techniques that can automatically detect quality issues at integration and system level. To achieve this goal, we will define novel approaches for detecting quality issues both in the integration (process-oriented) and in the resulting system (product-oriented). Such approaches will be mostly based on static analysis techniques (e.g., detection of community and code smells). Finally, novel approaches will be defined for automatically improving the quality of the integration (e.g., techniques for automatically adapting the technologies used by data scientists to production-ready code) and of the resulting system (e.g., ML-based system-specific refactoring operations). We also plan to devise approaches based on data-driven techniques, which will still follow an explainable and cost-effective philosophy. The second expected result is a set of cost-effective approaches to recommend operations for fixing the quality issues at the ML integration level. • OB4: Definition of approaches for assessing and improving the quality of deployment and operation of ML-based systems. We will focus on the CI/CD philosophy and, specifically, on the configuration of the pipelines for building the final product and checking its quality. Indeed, suboptimal configurations of such pipelines may hinder the quality of the final product. 
We will also focus on virtualization and/or containerization and, specifically, on the composition of the images describing the execution environments of the system components. Finally, we will focus on the software log quality. The first expected result is a set of techniques that can automatically detect quality issues at the deployment and operation levels. To achieve this goal, we will define novel approaches based on static analysis techniques (e.g., detection of configuration smells for Docker files) and dynamic analysis techniques (e.g., analysis of the execution logs). The second expected result is a set of cost-effective approaches that can recommend operations to fix the quality issues at the deployment and operation levels. Especially, new approaches will be defined for automatically improving the quality of deployment and operation.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">State of the Art</head><p>In the following, we overview the literature on the three pillars of QualAI.</p><p>Data and ML Models Studies were conducted to describe issues affecting data and ML model quality. Sculley et al. <ref type="bibr" target="#b3">[4,</ref><ref type="bibr" target="#b1">2]</ref> identified design issues that threaten robustness, relevance, and efficiency. Recently, taxonomies and causes of bugs for deep learning applications were also developed <ref type="bibr" target="#b4">[5,</ref><ref type="bibr" target="#b5">6]</ref>. Bugs were generally related to wrong configuration of ML models, which impacts their robustness, or to misinterpretation of the ML model, leading data scientists to not understand its predictions. Zhang et al. <ref type="bibr" target="#b6">[7]</ref> elicited open challenges in ML testing showing that the most critical issues affecting the reliability of ML systems concern their robustness, fairness, and correctness. Brun and Meliou <ref type="bibr" target="#b7">[8]</ref> urged SE researchers to address the challenges of designing fair software. Further studies <ref type="bibr" target="#b8">[9,</ref><ref type="bibr" target="#b9">10,</ref><ref type="bibr" target="#b10">11]</ref> described the challenges of reproducing computational notebooks, i.e., tools designed to make data analysis easier to document and reproduce. Recent studies proposed tools to detect anomalies or inefficiencies in datasets before feeding them into ML pipelines <ref type="bibr" target="#b11">[12,</ref><ref type="bibr" target="#b12">13]</ref>. All these studies highlight the importance of data and model quality for building successful AI systems. However, the few available studies represent a call for further research on investigating quality issues related to AI systems and defining recommenders to improve their overall quality.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>ML Integration</head><p>Quality assurance of ML integration is challenging due to the different backgrounds of data scientists, who build ML models, and software developers, who make the ML models available in the system <ref type="bibr" target="#b13">[14]</ref>. Recommenders were proposed to detect such social smells that occur, for example, when communication is lacking between teams working on different system components <ref type="bibr" target="#b14">[15]</ref>. Sculley et al. <ref type="bibr" target="#b1">[2]</ref> highlighted that cultural debt may arise when teams with different skills collaborate, and process management debt may accrue when many ML models are run in the same system, leading to problems with resource management and the model maintenance. Kim (2020) described roles and responsibilities that different stakeholders should have when debugging and testing ML models at different development stages. Zhang et al. <ref type="bibr" target="#b6">[7]</ref> highlighted that security problems in ML systems may appear not only in the model in isolation but also in the integration with the rest of the system. Indeed, ML systems can be vulnerable to unique attacks, such as model stealing or data poisoning, which might compromise their integrity and confidentiality. The literature mostly focuses on understanding issues related to ML integration quality. Only recently, researchers have started investigating communication issues in multidisciplinary teams for AI-based software development <ref type="bibr" target="#b15">[16]</ref>. We plan to further investigate communication challenges in the development of ML systems.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Deployment and Operations</head><p>A few studies investigated how to appropriately deploy AI systems, especially concerning how to set up CI/CD pipelines. Recent research pointed out the need for ML-specific pipelines that consider common needs, like the availability of models with good accuracy or suitable training data - thus supporting the idea of establishing quality control mechanisms for ML systems. Karlas et al. <ref type="bibr" target="#b16">[17]</ref> defined a tool for integrating ML tools within existing CI/CD pipelines. Humbatova et al. <ref type="bibr" target="#b5">[6]</ref> identified further issues related to model configuration, e.g., API-related issues, which call for additional tools. Cito et al. <ref type="bibr" target="#b17">[18]</ref> analyzed common quality issues of Dockerfiles in open-source projects, while Wu et al. <ref type="bibr" target="#b18">[19]</ref> defined a proper catalog of configuration smells for such files. No previous studies specifically addressed the problem of quality assurance for ML system containerization. The literature does not provide enough support to specialists in properly deploying ML systems. We also found no techniques for monitoring ML systems in production.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: The Workflow of the QualAI framework.</figDesc><graphic coords="3,89.29,84.19,416.67,232.07" type="bitmap" /></figure>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="1" xml:id="foot_0">MLOps, https://ml-ops.org,</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="2020" xml:id="foot_1"></note>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>This research was funded by the European Union - NextGenerationEU through the Italian Ministry of University and Research, Projects PRIN 2022 ("QualAI: Continuous Quality Improvement of AI-based Systems", grant n. 2022B3BP5S, CUP: H53D23003510006).</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">A human-centered evaluation of a deep learning system deployed in clinics for the detection of diabetic retinopathy</title>
		<author>
			<persName><forename type="first">E</forename><surname>Beede</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Baylor</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Hersch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Iurchenko</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Wilcox</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Ruamviboonsuk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">M</forename><surname>Vardoulakis</surname></persName>
		</author>
		<idno type="DOI">10.1145/3313831.3376718</idno>
	</analytic>
	<monogr>
		<title level="m">Proc. of the 2020 CHI Conf. on Human Factors in Computing Systems, CHI &apos;20</title>
				<meeting>of the 2020 CHI Conf. on Human Factors in Computing Systems, CHI &apos;20</meeting>
		<imprint>
			<publisher>ACM</publisher>
			<date type="published" when="2020">2020</date>
			<biblScope unit="page" from="1" to="12" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">Hidden technical debt in machine learning systems</title>
		<author>
			<persName><forename type="first">D</forename><surname>Sculley</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Holt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Golovin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Davydov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Phillips</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Ebner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Chaudhary</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Young</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J.-F</forename><surname>Crespo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Dennison</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proc. of the 28th Int&apos;l Conf. on Neural Information Processing Systems -Volume 2, NIPS&apos;15</title>
				<meeting>of the 28th Int&apos;l Conf. on Neural Information Processing Systems -Volume 2, NIPS&apos;15</meeting>
		<imprint>
			<publisher>MIT Press</publisher>
			<date type="published" when="2015">2015</date>
			<biblScope unit="page" from="2503" to="2511" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Knowledge-aware autoencoders for explainable recommender systems</title>
		<author>
			<persName><forename type="first">V</forename><surname>Bellini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Schiavone</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Di Noia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ragone</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Di Sciascio</surname></persName>
		</author>
		<idno type="DOI">10.1145/3270323.3270327</idno>
	</analytic>
	<monogr>
		<title level="m">Proc. of the 3rd Workshop on Deep Learning for Recommender Systems, DLRS 2018</title>
				<meeting>of the 3rd Workshop on Deep Learning for Recommender Systems, DLRS 2018</meeting>
		<imprint>
			<publisher>ACM</publisher>
			<date type="published" when="2018">2018</date>
			<biblScope unit="page" from="24" to="31" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Machine learning: The high interest credit card of technical debt</title>
		<author>
			<persName><forename type="first">D</forename><surname>Sculley</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Holt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Golovin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Davydov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Phillips</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Ebner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Chaudhary</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Young</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">SE4ML: Software Engineering for Machine Learning (NIPS 2014 Workshop)</title>
				<imprint>
			<date type="published" when="2014">2014</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">An empirical study on tensorflow program bugs</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S.-C</forename><surname>Cheung</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Xiong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Zhang</surname></persName>
		</author>
		<idno type="DOI">10.1145/3213846.3213866</idno>
	</analytic>
	<monogr>
		<title level="m">Proc. of the 27th ACM SIGSOFT Int&apos;l Symp. on Software Testing and Analysis, ISSTA 2018</title>
				<meeting>of the 27th ACM SIGSOFT Int&apos;l Symp. on Software Testing and Analysis, ISSTA 2018</meeting>
		<imprint>
			<publisher>ACM</publisher>
			<date type="published" when="2018">2018</date>
			<biblScope unit="page" from="129" to="140" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">Taxonomy of real faults in deep learning systems</title>
		<author>
			<persName><forename type="first">N</forename><surname>Humbatova</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Jahangirova</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Bavota</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Riccio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Stocco</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Tonella</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">2020 IEEE/ACM 42nd Int&apos;l Conf. on Software Engineering (ICSE)</title>
				<imprint>
			<date type="published" when="2019">2019</date>
			<biblScope unit="page" from="1110" to="1121" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">Machine learning testing: Survey, landscapes and horizons</title>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">M</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Harman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Ma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Liu</surname></persName>
		</author>
		<idno type="DOI">10.1109/TSE.2019.2962027</idno>
	</analytic>
	<monogr>
		<title level="j">IEEE Trans. on Softw. Eng</title>
		<imprint>
			<biblScope unit="volume">48</biblScope>
			<biblScope unit="page" from="1" to="36" />
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">Software fairness</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Brun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Meliou</surname></persName>
		</author>
		<idno type="DOI">10.1145/3236024.3264838</idno>
	</analytic>
	<monogr>
		<title level="m">Proc. of the 2018 26th ACM Joint Meeting on European Software Engineering Conf. and Symposium on the Foundations of Software Engineering, ESEC/FSE 2018</title>
				<meeting>of the 2018 26th ACM Joint Meeting on European Software Engineering Conf. and Symposium on the Foundations of Software Engineering, ESEC/FSE 2018</meeting>
		<imprint>
			<publisher>ACM</publisher>
			<date type="published" when="2018">2018</date>
			<biblScope unit="page" from="754" to="759" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Understanding and improving the quality and reproducibility of jupyter notebooks</title>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">F</forename><surname>Pimentel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Murta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Braganholo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Freire</surname></persName>
		</author>
		<idno type="DOI">10.1007/s10664-021-09961-9</idno>
	</analytic>
	<monogr>
		<title level="j">Empirical Software Engineering</title>
		<imprint>
			<biblScope unit="volume">26</biblScope>
			<biblScope unit="page">65</biblScope>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">What&apos;s wrong with computational notebooks? pain points, needs, and design opportunities</title>
		<author>
			<persName><forename type="first">S</forename><surname>Chattopadhyay</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Prasad</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">Z</forename><surname>Henley</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Sarma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Barik</surname></persName>
		</author>
		<idno type="DOI">10.1145/3313831.3376729</idno>
	</analytic>
	<monogr>
		<title level="m">Proc. of the 2020 CHI Conf. on Human Factors in Computing Systems, CHI &apos;20</title>
				<meeting>of the 2020 CHI Conf. on Human Factors in Computing Systems, CHI &apos;20</meeting>
		<imprint>
			<publisher>ACM</publisher>
			<date type="published" when="2020">2020</date>
			<biblScope unit="page" from="1" to="12" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">Assessing and restoring reproducibility of jupyter notebooks</title>
		<author>
			<persName><forename type="first">J</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T.-Y</forename><surname>Kuo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Zeller</surname></persName>
		</author>
		<idno type="DOI">10.1145/3324884.3416585</idno>
	</analytic>
	<monogr>
		<title level="m">Proc. of the 35th IEEE/ACM Int&apos;l Conf. on Automated Software Engineering, ASE &apos;20</title>
				<meeting>of the 35th IEEE/ACM Int&apos;l Conf. on Automated Software Engineering, ASE &apos;20</meeting>
		<imprint>
			<publisher>ACM</publisher>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="138" to="149" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Data validation for machine learning</title>
		<author>
			<persName><forename type="first">E</forename><surname>Breck</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Polyzotis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Roy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Whang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Zinkevich</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proc. of MLSys 2019</title>
				<editor>
			<persName><forename type="first">A</forename><surname>Talwalkar</surname></persName>
		</editor>
		<meeting>of MLSys 2019</meeting>
		<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">The data linter: Lightweight, automated sanity checking for ml data sets</title>
		<author>
			<persName><forename type="first">N</forename><surname>Hynes</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Sculley</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Terry</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">NIPS MLSys Workshop</title>
				<imprint>
			<date type="published" when="2017">2017</date>
			<biblScope unit="volume">1</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">Collaboration challenges in building ml-enabled systems: communication, documentation, engineering, and process</title>
		<author>
			<persName><forename type="first">N</forename><surname>Nahar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Zhou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Kästner</surname></persName>
		</author>
		<idno type="DOI">10.1145/3510003.3510209</idno>
	</analytic>
	<monogr>
		<title level="m">Proc. of the 44th Int&apos;l Conf. on Software Engineering, ICSE &apos;22</title>
				<meeting>of the 44th Int&apos;l Conf. on Software Engineering, ICSE &apos;22</meeting>
		<imprint>
			<publisher>ACM</publisher>
			<date type="published" when="2022">2022</date>
			<biblScope unit="page" from="413" to="425" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">Discovering community patterns in open-source: a systematic approach and its evaluation</title>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">A</forename><surname>Tamburri</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Palomba</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Serebrenik</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Zaidman</surname></persName>
		</author>
		<idno type="DOI">10.1007/s10664-018-9659-9</idno>
	</analytic>
	<monogr>
		<title level="j">Empirical Softw. Engg</title>
		<imprint>
			<biblScope unit="volume">24</biblScope>
			<biblScope unit="page" from="1369" to="1417" />
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">How ai developers overcome communication challenges in a multidisciplinary team: A case study</title>
		<author>
			<persName><forename type="first">D</forename><surname>Piorkowski</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Park</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">Y</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Muller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Portnoy</surname></persName>
		</author>
		<idno type="DOI">10.1145/3449205</idno>
	</analytic>
	<monogr>
		<title level="j">Proc. ACM Hum.-Comput. Interact</title>
		<imprint>
			<biblScope unit="volume">5</biblScope>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">Building continuous integration services for machine learning</title>
		<author>
			<persName><forename type="first">B</forename><surname>Karlaš</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Interlandi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Renggli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Mukunthu Iyappan Babu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Edwards</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Lauren</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Weimer</surname></persName>
		</author>
		<idno type="DOI">10.1145/3394486.3403290</idno>
	</analytic>
	<monogr>
		<title level="m">Proc. of the 26th ACM SIGKDD Int&apos;l Conf. on Knowledge Discovery &amp; Data Mining, KDD &apos;20</title>
				<meeting>of the 26th ACM SIGKDD Int&apos;l Conf. on Knowledge Discovery &amp; Data Mining, KDD &apos;20<address><addrLine>New York, NY, USA</addrLine></address></meeting>
		<imprint>
			<publisher>ACM</publisher>
			<date type="published" when="2020">2020</date>
			<biblScope unit="page" from="2407" to="2415" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<analytic>
		<title level="a" type="main">An empirical analysis of the docker container ecosystem on github</title>
		<author>
			<persName><forename type="first">J</forename><surname>Cito</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Schermann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">E</forename><surname>Wittern</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Leitner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Zumberi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><forename type="middle">C</forename><surname>Gall</surname></persName>
		</author>
		<idno type="DOI">10.1109/MSR.2017.67</idno>
	</analytic>
	<monogr>
		<title level="m">2017 IEEE/ACM 14th Int&apos;l Conf. on Mining Software Repositories (MSR)</title>
				<imprint>
			<date type="published" when="2017">2017</date>
			<biblScope unit="page" from="323" to="333" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">Characterizing the occurrence of dockerfile smells in open-source software: An empirical study</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Wang</surname></persName>
		</author>
		<idno type="DOI">10.1109/ACCESS.2020.2973750</idno>
	</analytic>
	<monogr>
		<title level="j">IEEE Access</title>
		<imprint>
			<biblScope unit="volume">8</biblScope>
			<biblScope unit="page" from="34127" to="34139" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
