<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Dataversifying Natural Sciences: Pioneering a Data Lake Architecture for Curated Data-Centric Experiments in Life &amp; Earth Sciences</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Genoveva</forename><surname>Vargas-Solar</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution" key="instit1">CNRS</orgName>
								<orgName type="institution" key="instit2">Univ. Lyon</orgName>
								<orgName type="institution" key="instit3">INSA Lyon</orgName>
								<orgName type="institution" key="instit4">UCBL</orgName>
								<orgName type="institution" key="instit5">LIRIS</orgName>
								<address>
									<postCode>UMR5205, F-69221</postCode>
									<country key="FR">France</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Jérôme</forename><surname>Darmont</surname></persName>
							<email>jerome.darmont@univ-lyon2.fr</email>
							<affiliation key="aff1">
								<orgName type="institution">Université de Lyon</orgName>
								<address>
									<settlement>Lyon</settlement>
								</address>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="institution" key="instit1">UR</orgName>
								<orgName type="institution" key="instit2">ERIC France</orgName>
								<address>
									<addrLine>5 avenue Mendès</addrLine>
									<postCode>69676</postCode>
									<settlement>Bron Cedex</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Alejandro</forename><surname>Adorjan</surname></persName>
							<email>aadorian@gmail.com</email>
							<affiliation key="aff4">
								<orgName type="institution">Universidad ORT</orgName>
								<address>
									<settlement>Montevideo</settlement>
									<country key="UY">Uruguay</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Javier</forename><forename type="middle">A</forename><surname>Espinosa-Oviedo</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution" key="instit1">CNRS</orgName>
								<orgName type="institution" key="instit2">Univ. Lyon</orgName>
								<orgName type="institution" key="instit3">INSA Lyon</orgName>
								<orgName type="institution" key="instit4">UCBL</orgName>
								<orgName type="institution" key="instit5">LIRIS</orgName>
								<address>
									<postCode>UMR5205, F-69221</postCode>
									<country key="FR">France</country>
								</address>
							</affiliation>
							<affiliation key="aff3">
								<orgName type="institution">CPE Lyon</orgName>
								<address>
									<addrLine>43 Blvd. du 11 Novembre 1918</addrLine>
									<postCode>69616</postCode>
									<settlement>Villeurbanne Cedex</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Carmem</forename><surname>Hara</surname></persName>
							<email>carmemhara@ufpr.br</email>
							<affiliation key="aff5">
								<orgName type="department">Dept. de Informatica</orgName>
								<orgName type="institution">Universidade Federal do Paraná</orgName>
								<address>
									<postCode>81531-980</postCode>
									<settlement>Curitiba -PR</settlement>
									<country key="BR">Brazil</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Sabine</forename><surname>Loudcher</surname></persName>
							<email>sabine.loudcher@univ-lyon2.fr</email>
							<affiliation key="aff1">
								<orgName type="institution">Université de Lyon</orgName>
								<address>
									<settlement>Lyon</settlement>
								</address>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="institution" key="instit1">UR</orgName>
								<orgName type="institution" key="instit2">ERIC France</orgName>
								<address>
									<addrLine>5 avenue Mendès</addrLine>
									<postCode>69676</postCode>
									<settlement>Bron Cedex</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Regina</forename><surname>Motz</surname></persName>
							<email>rmotz@fing.edu.uy</email>
							<affiliation key="aff6">
								<orgName type="department">Instituto de Computación (INCO) Facultad de Ingeniería</orgName>
								<orgName type="institution">Universidad de la República</orgName>
								<address>
									<country key="UY">Uruguay</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Martin</forename><surname>Musicante</surname></persName>
							<affiliation key="aff7">
								<orgName type="institution">Universidad Federal Rio Grande do Norte</orgName>
								<address>
									<settlement>DIMAP</settlement>
									<region>Natal</region>
									<country key="BR">Brazil</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Luis</forename><surname>Zechinelli-Martini</surname></persName>
							<affiliation key="aff8">
								<orgName type="institution">Fundación Universidad de las Américas</orgName>
								<address>
									<addrLine>Puebla Exhacienda Sta. Catarina Mártir s/n</addrLine>
									<postCode>72820</postCode>
									<settlement>San Andrés Cholula</settlement>
									<country key="MX">Mexico</country>
								</address>
							</affiliation>
						</author>
						<author>
							<affiliation key="aff9">
								<address>
									<settlement>Paestum</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<affiliation key="aff10">
								<orgName type="institution">Genoveva Vargas-Solar</orgName>
							</affiliation>
						</author>
						<title level="a" type="main">Dataversifying Natural Sciences: Pioneering a Data Lake Architecture for Curated Data-Centric Experiments in Life &amp; Earth Sciences</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">08D302BA04BE4AD5FD523B651A1E5203</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T17:17+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Life and Earth sciences</term>
					<term>data-driven experiments</term>
					<term>data lake</term>
					<term>data curation</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>This vision paper introduces a pioneering data lake architecture designed to meet Life &amp; Earth sciences' burgeoning data management needs. As the data landscape evolves, the imperative to navigate and maximise scientific opportunities has never been greater. Our vision paper outlines a strategic approach to unify and integrate diverse datasets, aiming to cultivate a collaborative space conducive to scientific discovery. The core of the design and construction of a data lake is the development of formal and semi-automatic tools, enabling the meticulous curation of quantitative and qualitative data from experiments. Our unique "research-in-the-loop" methodology ensures that scientists across various disciplines are integrally involved in the curation process, combining automated, mathematical, and manual tasks to address complex problems, from seismic detection to biodiversity studies. By fostering reproducibility and applicability of research, our approach enhances the integrity and impact of scientific experiments. This initiative is set to improve data management practices, strengthening the capacity of Life &amp; Earth sciences to solve some of our time's most critical environmental and biological challenges.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>These days, it is relatively easy and inexpensive to acquire massive amounts of data, even in continuous mode. This has been no different for experimental and observational sciences like Life &amp; Earth sciences. Accessibility to data about the Earth and its biodiversity, with varying levels of provenance, quality and reliability, opens up the possibility of constructing different perspectives on the phenomena observed, leading to scientific conclusions with different depths that target a wide range of knowledge consumers (civilians, decision-makers, scientists).</p><p>Traditional schema-on-write approaches, such as the Extraction, Transformation and Loading (ETL) process, are ineffective for the data management requirements of these experimental sciences. Data lakes are becoming increasingly common for the management and analysis of massive data. Data lakes are repositories that store raw data in its original format. They can be well adapted for storing data harvested from digital sources (observation stations), social media, Web and in situ collectors.</p><p>The extraction of value through data-driven experiments in the Life &amp; Earth sciences is determined by two main elements:</p><p>• The maintenance of metadata gathering the conditions under which experiments are performed (quantitative perspective) to preserve the memory of the experimental knowledge production process, and to enable understanding and reproducibility. 
• An open science perspective that can go beyond data sharing and must consider the sharing of know-how, decision-making, expertise, project management, and people within the projects that define the research.</p><p>This vision paper introduces our approach to designing and building a data lake for collecting and integrating data and metadata of Life &amp; Earth sciences' data-driven experiments.</p><p>The remainder of the paper is organised as follows. Section 2 gives a general overview of approaches that address curating and managing knowledge in Life &amp; Earth sciences. Section 3 describes the challenges associated with curating data and data-driven experiments in Life &amp; Earth sciences often guided by researchers. In particular, the section gives the general challenges for building data lakes containing curated data and producing knowledge derived from data-driven experiments. Section 4 introduces the general principle for building, maintaining and exploiting a data lake. The data lake allows the creation of "dataverses" that can export the history of the development of experimental processes that lead to knowledge in Life &amp; Earth sciences. Finally, Section 5 concludes the paper and discusses future work.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Related work</head><p>We introduce the main topics and approaches that underline the vision of maintaining and sharing data to perform data-driven experiments: data harvesting tools, data curation techniques, data labs, data lakes, science lakes and dataverses.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Data harvesting</head><p>Data available on the Web play a determining role in decision-making in personal and corporate life. Collecting and storing this data in a structured model helps integrate them with other sources and use the dataset in various applications, such as event detection and sentiment monitoring. Online newspapers are essential sources of information, accessed daily by thousands of people.</p><p>Various works in the literature report manual efforts to extract data from pages on the Web <ref type="bibr" target="#b0">[1,</ref><ref type="bibr" target="#b1">2]</ref>. However, these efforts have been eased by applying Web scraping techniques. Some work complements automated extraction processes to obtain clean and analysed data by implementing curation procedures <ref type="bibr" target="#b2">[3]</ref>. Among the various existing tools available on the Web for data extraction, we can highlight the following. ParseHub<ref type="foot" target="#foot_0">1</ref> is a web scraping tool that facilitates data extraction from websites through an interactive click-based interface, saving the data directly to the cloud in JSON and CSV formats. It navigates through continuation pages and captures complete news articles, with the ability to collect data based on specific character sequences. 80legs<ref type="foot" target="#foot_1">2</ref> offers sequential data extraction from websites. Octoparse<ref type="foot" target="#foot_2">3</ref> simplifies the data extraction process by enabling users to create a scraping workflow with clicks. It includes features like URL and string lists for targeted scraping and ready-to-use templates for popular sites like Amazon and Google. FactExtract <ref type="bibr" target="#b2">[3]</ref> is tailored for aggregating content from specific Senegalese news sources, boasting automatic language detection for ten languages, data cleaning, and analysis, all whilst avoiding data duplication. 
This tool, which utilises Python's Newspaper library, also features automated daily updates for the news content it monitors. ENoW - News Data Extractor from the Web<ref type="foot" target="#foot_3">4</ref> is a news scraping system that explores online newspapers. ENoW receives search strings as input and stores in a relational database data extracted from the news and their full content.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Data curation</head><p>According to Garcov et al., <ref type="bibr" target="#b3">[4]</ref>, research data curation is "preparing research data and artefacts for sharing and long-term preservation". Research repositories are the standard for publishing data collections to the research communities. Datasets at an early collection stage are generally not ready for analysis or preservation. Thus, extensive preprocessing, cleaning, transformation, and documentation actions are required to support usability, sharing, and preservation over time <ref type="bibr" target="#b4">[5]</ref>. Curated data collections have the potential to drive scientific progress <ref type="bibr" target="#b5">[6]</ref>, are relevant for reproducibility and improve the reliability of sciences <ref type="bibr" target="#b6">[7]</ref>. However, data curation introduces challenges for supporting data-driven applications <ref type="bibr" target="#b7">[8]</ref> adopting quanti-qualitative methods. For example, research challenges curating material across time, space and collaborators <ref type="bibr" target="#b6">[7]</ref>. Quantitative and qualitative research methodologies apply ad-hoc data curation strategies that keep track of the data that describe the tools, techniques, hypothesis, and data harvesting criteria defined a priori by a scientific team.</p><p>Several software tools that apply statistical techniques and machine learning algorithms are available for qualitative researchers. Woods et al. <ref type="bibr" target="#b8">[9]</ref> argue that Computer-Assisted Qualitative Data Analysis Software (CAQDAS) is a well-known tool for qualitative research. These tools support qualitative techniques and methods for applying Qualitative Data Analysis (QDA). 
ATLAS.ti <ref type="bibr" target="#b9">[10]</ref>, Dedoose <ref type="bibr" target="#b10">[11]</ref>, MAXQDA <ref type="bibr" target="#b11">[12]</ref>, NVivo <ref type="bibr" target="#b12">[13]</ref> implement the mat. CAQDAS <ref type="bibr" target="#b13">[14]</ref> researchers and practitioners can perform annotation, labelling, querying, audio and video transcription, pattern discovery, and report generation. Furthermore, CAQDAS tools allow the creation of field notes, thematic coding, search for connections, memos (thoughtful comments), contextual analysis, frequency analysis, word location and data analysis presentation in different reporting formats <ref type="bibr" target="#b14">[15]</ref>. The REFI-QDA (Rotterdam Exchange Format Initiative) <ref type="foot" target="#foot_4">5</ref> standard allows the exchange of qualitative data to enable reuse in QDAS <ref type="bibr" target="#b15">[16]</ref>. QDA software such as ATLAS.ti <ref type="bibr" target="#b9">[10]</ref>, Dedoose <ref type="bibr" target="#b10">[11]</ref>, MAXQDA <ref type="bibr" target="#b11">[12]</ref>, NVivo <ref type="bibr" target="#b12">[13]</ref>, QDAMiner <ref type="bibr" target="#b16">[17]</ref>, Quirkos <ref type="bibr" target="#b17">[18]</ref> and Transana <ref type="bibr" target="#b18">[19]</ref> adopt the REFI-QDA standard.</p><p>We assume that data curation consists of identifying, systematizing, managing, and versioning research data, considering versioning artefacts an essential component of tracking changes along the research project.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.">Data labs</head><p>Data science environments provide data labs like Kaggle <ref type="foot" target="#foot_5">6</ref> and Dryad<ref type="foot" target="#foot_6">7</ref> with stacks of services for (externalised) data storage, tagging and exploring tools. These environments allow a collective sharing space of highly curated data collection maintenance tools. There are specialised repositories like DataOne<ref type="foot" target="#foot_7">8</ref> and data repositories re3data <ref type="foot" target="#foot_8">9</ref> .</p><p>DataONE (Data Observation Network for Earth) is a community-driven project that provides access to various environmental and ecological data across multiple member repositories. It is designed as an innovative framework aimed at facilitating research and enabling scientists and researchers to preserve, access, use, and increase the impact of their data. The platform provides robust data management tools, ensuring datasets' preservation and integrity. DataONE underscores data stewardship as a federated resource and supports scientific collaboration and reproducibility. It is invaluable for researchers seeking to address complex environmental challenges through shared data and knowledge.</p><p>Re3data is a global registry of research data repositories that offers a comprehensive directory for researchers seeking to access, store, share, and manage their datasets. It represents a variety of academic disciplines and provides detailed information about each repository, such as access policies, standards, and contact details. re3data promotes data sharing, visibility, and reuse as a critical reference point for finding suitable repositories for data deposition. The platform enhances transparency in research data management. 
It supports open science by guiding users to trustworthy and reliable repositories, thereby facilitating the discovery of high-quality data across different scientific fields.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.4.">Data lake, science lake and dataverse</head><p>Data lakes are expansive storage repositories that hold vast raw data in their native format until needed. Stein and Morrison <ref type="bibr" target="#b19">[20]</ref> emphasised their potential for scalability and flexibility in handling big data from various sources. In recent studies, Dixon in 2010 <ref type="foot" target="#foot_9">10</ref> defined the term and its initial application in big data analytics. Quix et al. (2016) <ref type="bibr" target="#b20">[21]</ref> delved into the architectural considerations and challenges such as data governance and metadata management.</p><p>Science lakes, an offshoot of data lakes, are tailored specifically for the scientific community to address the need for interdisciplinary research, data management and complex analytics. Russom (2016) <ref type="bibr" target="#b21">[22]</ref> suggested that science lakes provide a more discipline-specific approach to data handling, enabling better metadata curation and domain-specific data models, which are crucial for reproducibility in scientific research.</p><p>A data lake is a vast storage system that houses extensive volumes of raw data in its original format. This versatile system accommodates a range of data types, including structured, semi-structured, and unstructured forms. Data lakes are essential in environments focused on big data analytics and are designed to manage data characterised by large volume, high velocity, and diverse variety from multiple sources. They are commonly utilised for advanced data processing activities such as machine learning and predictive analytics. Unlike traditional databases following the schema-on-write approach, data lakes follow the schema-on-read approach, providing flexibility in how data is formatted and used.</p><p>Dataverse. 
The concept of dataverse takes the notion of data lakes further by creating a networked space where data is stored, actively managed, and shared within the scientific community. A dataverse is a data repository platform for publishing, citing, and discovering datasets. It enables researchers to publish, cite, and discover datasets while providing metadata and tools to ensure others can understand and use data. Dataverses are often domain-specific and support the principles of open science, providing features such as data version control, digital object identifiers (DOIs) for citation, and tools for data analysis within the platform. They are community-driven and emphasize the accessibility and reusability of research data.</p><p>The most prominent example is the open-source Dataverse project developed by the Institute for Quantitative Social Science at Harvard University. The Dataverse Project, initiated by King <ref type="bibr" target="#b22">[23]</ref>, provides an open-source platform for sharing, preserving, citing, exploring, and analysing research data. It focuses on data citation and reproducibility, as discussed by Crosas <ref type="bibr" target="#b23">[24]</ref>, who highlighted the platform's role in fostering collaboration and open science.</p><p>Different universities and academic institutions have built and promoted their dataverses for sharing and disseminating experimental scientific results, including the data collections they curate: the University of Arizona <ref type="foot" target="#foot_10">11</ref> , the University of Hamburg <ref type="foot" target="#foot_11">12</ref> , the University of Michigan <ref type="foot" target="#foot_12">13</ref> and the Grenoble Dataverse <ref type="foot" target="#foot_13">14</ref> .</p><p>Summary. 
Together, these systems represent a shift toward more open, integrated, and efficient ecosystems for data management, offering novel solutions to the challenges posed by the vast amounts of data generated in modern research. They move away from traditional databases and toward more fluid, dynamic systems that can accommodate the ever-changing landscape of big data and scientific research.</p><p>A dataverse and a data lake are concepts related to data storage and management but serve different purposes and are designed with varying cases of use in mind. While a dataverse is a scholarly platform aimed at curating, sharing, and preserving research data with rich metadata and community collaboration features, a data lake is a more generalised and scalable storage solution for raw data to support diverse data analytics and processing workflows.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.5.">Data lakes and data verses in Life &amp; Earth sciences</head><p>Dataverses in Life &amp; Earth sciences are specialised digital infrastructures designed to address specific data management needs for these scientific domains. They provide a structured yet flexible environment where datasets can be stored, accessed, shared, and analysed. These dataverses typically offer robust metadata standards and tools to ensure their data are well-described, making them discoverable and usable for various research purposes. In Life Sciences, dataverses often focus on genomics, proteomics, clinical trials, and other biological data, integrating various sources of information to aid in complex analyses like phenotype-genotype correlations. For Earth Sciences, dataverses might concentrate on geospatial data, climate models, seismic activity records, and ecological data, supporting efforts to understand and model the Earth's dynamic systems.</p><p>These repositories support open science by promoting data sharing across disciplinary boundaries. This feature enables researchers to replicate studies and build upon existing work, which is fundamental for advancing knowledge. They also facilitate interdisciplinary collaboration, allowing experts from different fields to contribute to and draw from a collective data pool. For instance, a dataverse in these fields might include a combination of high-throughput experimental data, field observations, and simulation outputs. The combination of openness and rigorous data management positions dataverses as critical resources in pursuing scientific discovery in Life &amp; Earth sciences.</p><p>In life and earth sciences, data lakes are pivotal for consolidating scientific data collected from various biodiversity studies and geological events like earthquakes. 
Once curated, processed, and analysed, this data contributes significantly to data-driven experiments underpinned by well-established protocols. The harvested data enriches the data lake and supports the creation of detailed, curated views for dissemination through dataverses.</p><p>Our vision emphasises the importance of developing and maintaining data lakes with partially curated content in life and earth sciences, facilitating the continuous cycle of experimental data feeding back into the lake and subsequently sharing via dataverses.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Maintaining and sharing earth and life sciences knowledge: challenges</head><p>Various data on life and earth sciences have been acquired from different sources <ref type="bibr" target="#b24">[25]</ref>. Integrated access to data collections and their curated versions can facilitate their maintenance, analysis and experimentation. It can also demonstrate knowledge of the discipline with its vocabulary, concepts and relationships in a synthetic way. Curation, maintenance and exploration of data collections in the data lake call for proposing techniques for exploring data collections that can be explored and enriched while producing new data and analytical results. Data curation also means keeping track of the type of experiments carried out on the data, their results and the conditions under which they were carried out. Maintaining a catalogue of data-related questions and experiments can promote open science, share data and knowledge, and share the data and knowledge the scientific community has gained from it <ref type="bibr" target="#b25">[26]</ref>. This information should also be stored in the data lake.</p><p>Challenge 1: How to structure and organise life and earth sciences metadata? Metadata modelling is a way of structuring and organising earthquakes and biodiversity. The metadata model must make the content of a data lake findable, accessible, interoperable and reusable (FAIR principles <ref type="bibr" target="#b26">[27]</ref>). Metadata can represent the data's structural, semantic and contextual aspects (provenance, conditions and assumptions under which the analytical results are obtained, i.e., the metadata driving the analysis). Most proposed models are based on logic or structured by graphs <ref type="bibr" target="#b28">[28,</ref><ref type="bibr" target="#b29">29]</ref> that can be specialised in seismic geophysical data and biodiversity. 
Besides, associating metadata can be achieved by considering quantitative and qualitative perspectives through data curation. Combining quantitative and qualitative approaches allows for a meta-model of the content used and produced in experiments and the conditions in which the content is produced, chosen, validated and considered representative knowledge for the domain of study.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Challenge 2: How to integrate data in the data lake?</head><p>Since the experiments require several data collections, integrating the data into the data lake must be part of a pipeline that includes data discovery, exploration, selection and integration. This process should be designed based on the requirements of life and earth science experiments <ref type="bibr" target="#b24">[25]</ref>. The heterogeneity of the data (text, signals, multimedia, proprietary formats from seismographs), the speed of the data often produced in the form of streams in the case of seismic sensors in addition to the volume are aspects that require original contributions in the design, maintenance and exploration of the data lake.</p><p>Challenge 3: How to integrate data in the data lake considering scientists' needs? The researcher's intervention, defined as a researcher-in-the-loop (RITL) <ref type="bibr" target="#b30">[30]</ref>, is a crucial aspect of human intervention to assess content concerning (i) the conditions in which it is produced and (ii) to make decisions about the new tasks to perform and the way a research project will move forward. RITL is a case of Human-in-the-loop (HITL), where the primary output of the process is a selection of the data, not a trained machine-learning model. HITL is crucial for handling supervision, exception control, optimisation, and maintenance <ref type="bibr" target="#b31">[31,</ref><ref type="bibr" target="#b32">32]</ref>. Under a RITL approach, a human sees all data points in the relevant selection at the end of the process. Using RITL requires a systematic solid way of working 15 . This characteristic is critical for designing content curation for quantitative and qualitative research methods.</p><p>Scientific content should be extracted and computed, including data, analytics tasks (manual and AI models), and associated metadata. 
This curated content allows the produced knowledge to be reusable and analytics results to be reproducible <ref type="bibr" target="#b33">[33]</ref>, thereby adhering to the FAIR principles <ref type="bibr" target="#b34">[34]</ref>. 15 https://hai.stanford.edu/news/humans-loop-design-interactive-ai-systems</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Towards a curation approach for building a Life &amp; Earth sciences data lake</head><p>Figure <ref type="figure" target="#fig_0">1</ref> illustrates the principle of our vision concerning the way a life and earth sciences data lake can be built, maintained and exploited. Our approach is based on the quantitative and qualitative curation of data harvested digitally and in situ (left-hand side of the figure). Heterogeneous raw data is gathered and stored in the data lake. Then, algorithms (statistical and Artificial Intelligence) and researchers can process, filter and classify data. This filtering process produces and stores meta-data in the data lake. Data exploration and integration (cleaning and engineering) processes can be performed on data samples from the data lake. They can be used for experimental purposes to produce content associated with the data stored in the data lake. Clean and curated data associated with meta-data representing the quantitative and qualitative perspective of the experiments can then be shared in a data verse (right-hand side of the figure).</p><p>Harvested data, models and knowledge integration. Various life and earth sciences data have been harvested from different sources. Since they are heterogeneous and produced at different paces (continuous and in batch), our approach proposes an integration approach based on a pivot meta-representation. The principle is to present a general meta-model of their content and process them for extracting technical, structural and semantic meta-data. This abstract representation provides integrated access to data collections and curated versions under a global knowledge graph and can promote their maintenance, analysis, and experimentation. It can also show the knowledge of the discipline with its vocabulary, concepts, and relations in a synthetic manner. 
The data lake can be pivotal in collecting, processing, and exporting raw data in a curated view.</p><p>Curation, maintenance, and exploration of data collections for bringing data value from in situ observations and experiments. Since data acts as a backbone in modelling phenomena for understanding their behaviour, it is critical to developing good collection and maintenance: which are available data collections? Are they complete? Which is their provenance? In which conditions were they collected? Have they been processed? In which cases have they been used, and what are the associated results? We propose techniques to explore data collections using graphs that can be explored and enriched while new data and analytics results are produced. Data curation also means keeping track of the type of experiments run on data, their results, and the conditions in which they were performed. Maintaining a</p><p>Pilot experiments. The data lake will be tested in real scenarios through collaboration with domain experts in seismology and biodiversity studies in Brazil. The entry point will be two pilot experiments, namely:</p><p>1. the classification process of seismic signals collected by stations through different observations to detect "natural" and human-made earthquakes in the northern region of Brazil; 2. the classification of in situ observations of the "carabela portuguesa"<note place="foot" n="16">The Portuguese caravel (Physalia physalis) is a monotypic colonial species of siphonophore hydrozoan of the family Physaliidae. It is commonly found in the open ocean in all warm waters of the world, especially in the tropical and subtropical regions of the Pacific and Indian Oceans, as well as in the Atlantic Gulf Stream. Its sting is dangerous and very painful (https://es.wikipedia.org/wiki/Physalia_physalis).</note> and modelling its behaviour on the Brazilian coast.</p><p>In both cases, it is necessary to (i) apply statistical methods to investigate and unveil new patterns in seisms and biodiversity data, answering open problems or leading to new research questions; (ii) build predictive models to better describe or approximate phenomena, increasing the knowledge about our planet. 
The conditions in which statistics and prediction are performed, results, observations, interpretation and validation of the results are data to be integrated into the data lake.</p><p>Discussion. The originality of the work is to address the construction of a data lake that includes:</p><p>1. Raw collected data representing life and earth sciences phenomena (streams, batch, multimedia, proprietary). 2. Data produced along data-driven experiments adopting data science techniques including artificial intelligence algorithms (ML-driven data lakes). 3. Contextual data describing the conditions in which data are collected, and experiments are designed and enacted. The data lake will provide data curation modules for extracting metadata according to a well-adapted model and modules exploring data and using them for designing new experimentations, thereby adopting an open science perspective.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Conclusions and future work</head><p>Our vision is that it is necessary to address fundamental research topics at the centre of Data Science, Big Data management and analytics for solving data-driven problems in life and earth sciences. The contribution is the design of a data lake, together with techniques for exploring it, based on a well-adapted model for metadata about life and earth sciences experiments that consume and produce quantitative and qualitative data. An important piece of future work will be to define exploration operators and pipelines to exploit the content for further maintaining and developing new life and earth sciences experiments.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: General overview of the curation approach for building, maintaining and exploiting a data lake.</figDesc><graphic coords="6,99.71,84.18,395.87,206.16" type="bitmap" /></figure>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="1" xml:id="foot_0">https://www.parsehub.com/</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="2" xml:id="foot_1">https://80legs.com/</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="3" xml:id="foot_2">https://www.octoparse.com/</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="4" xml:id="foot_3">L Reips, M Musicante, G Vargas-Solar, ATR Pozo, C.S Hara, ENoW-Extrator de Dados de Notícias da Web, Demonstration Anais Estendidos do XXXVIII Simpósio Brasileiro de Bancos de Dados, 2023, 78-83</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="5" xml:id="foot_4">https://www.qdasoftware.org</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="6" xml:id="foot_5">https://www.kaggle.com/</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="7" xml:id="foot_6">https://datadryad.org/stash</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="8" xml:id="foot_7">https://www.dataone.org/about/</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="9" xml:id="foot_8">https://www.re3data.org</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="10" xml:id="foot_9">https://jamesdixon.wordpress.com/2014/09/25/data-lakes-revisited/</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="11" xml:id="foot_10">https://arizona.figshare.com</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="12" xml:id="foot_11">https://www.fdm.uni-hamburg.de/en/fdm.html</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="13" xml:id="foot_12">https://www.icpsr.umich.edu/web/about/cms/2365</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="14" xml:id="foot_13">https://scienceouverte.couperin.org/cellule-data-grenoble-alpes/</note>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">Acknowledgements</head><p>The work reported in this paper is done in the context of the LETITIA <ref type="foot">17</ref> project, funded by the Fédération Informatique de Lyon <ref type="foot">18</ref>.</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Laclichev: Exploring the history of climate change in latin america within newspapers digital collections</title>
		<author>
			<persName><forename type="first">G</forename><surname>Vargas-Solar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J.-L</forename><surname>Zechinelli-Martini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">A</forename><surname>Espinosa-Oviedo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">M</forename><surname>Vilches-Blázquez</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">New Trends in Database and Information Systems: ADBIS 2021 Short Papers, Doctoral Consortium and Workshops: DOING, SIMPDA, MADEISD, Mega-Data, CAoNS</title>
				<meeting><address><addrLine>Tartu, Estonia</addrLine></address></meeting>
		<imprint>
			<publisher>Springer</publisher>
			<date type="published" when="2021">August 24-26, 2021. 2021</date>
			<biblScope unit="page" from="121" to="132" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">Redes sociais como uma fonte de dados alternativa para monitorar águas-vivas no brasil</title>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">S</forename><surname>Do Nascimento</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">S</forename><surname>Hara</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">N</forename><surname>Junior</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Noernberg</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Livro de Memórias do IV SUSTENTARE e VII WIPIS: Workshop internacional de Sustentabilidade, Indicadores e Gestão de Recursos Hídricos (Online) -Even3</title>
				<meeting><address><addrLine>Piracicaba</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Factextract: automatic collection and aggregation of articles and journalistic factual claims from online newspaper</title>
		<author>
			<persName><forename type="first">E</forename><forename type="middle">N</forename><surname>Sarr</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Ousmane</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Diallo</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">2018 Fifth International Conference on Social Networks Analysis, Management and Security (SNAMS)</title>
				<imprint>
			<publisher>IEEE</publisher>
			<date type="published" when="2018">2018</date>
			<biblScope unit="page" from="336" to="341" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<monogr>
		<author>
			<persName><forename type="first">D</forename><surname>Garkov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Müller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Braun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Weiskopf</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Schreiber</surname></persName>
		</author>
		<title level="m">research data curation in visualization: Position paper</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
	<note>data</note>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Leveraging machine learning to detect data curation activities</title>
		<author>
			<persName><forename type="first">S</forename><surname>Lafia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Thomer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Bleckley</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Akmon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Hemphill</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">2021 IEEE 17th International Conference on eScience (eScience)</title>
				<imprint>
			<publisher>IEEE</publisher>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="149" to="158" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">What drives and inhibits researchers to share and use open research data? A systematic literature review to analyze factors influencing open research data adoption</title>
		<author>
			<persName><forename type="first">A</forename><surname>Zuiderwijk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Shinde</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Jeng</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">PloS One</title>
		<imprint>
			<biblScope unit="volume">15</biblScope>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">Curating research assets: A tutorial on the git version control system</title>
		<author>
			<persName><forename type="first">M</forename><surname>Vuorre</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">P</forename><surname>Curley</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Advances in Methods and Practices in Psychological Science</title>
		<imprint>
			<biblScope unit="volume">1</biblScope>
			<biblScope unit="page" from="219" to="236" />
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<monogr>
		<title level="m" type="main">Synchronic curation for assessing reuse and integration fitness of multiple data collections</title>
		<author>
			<persName><forename type="first">M</forename><surname>Esteva</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Simone</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Nagpal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Gupta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Jah</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Researcher reflexivity: exploring the impacts of caqdas use</title>
		<author>
			<persName><forename type="first">M</forename><surname>Woods</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Macklin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><forename type="middle">K</forename><surname>Lewis</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">International Journal of Social Research Methodology</title>
		<imprint>
			<biblScope unit="volume">19</biblScope>
			<biblScope unit="page" from="385" to="403" />
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<monogr>
		<ptr target="https://atlasti.com" />
		<title level="m">ATLAS.ti</title>
				<imprint>
			<date type="published" when="2023-04">April 2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<monogr>
		<title/>
		<author>
			<persName><forename type="first">Dedoose</forename><surname>Dedoose</surname></persName>
		</author>
		<ptr target="https://www.dedoose.com/" />
		<imprint>
			<date type="published" when="2023-04">April 2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<monogr>
		<author>
			<persName><forename type="first">V</forename><surname>Software</surname></persName>
		</author>
		<ptr target="http://maxqda.com" />
		<title level="m">Maxqda</title>
				<imprint>
			<date type="published" when="2023-04">April 2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<monogr>
		<ptr target="https://www.qsrinternational.com/" />
		<title level="m">Nvivo</title>
				<imprint>
			<date type="published" when="2023-04">April 2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">Using machine learning to support qualitative coding in social science: Shifting the focus to ambiguity</title>
		<author>
			<persName><forename type="first">N</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Drouhard</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Kocielnik</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Suh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Aragon</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">ACM Transactions on Interactive Intelligent Systems</title>
		<imprint>
			<biblScope unit="volume">8</biblScope>
			<biblScope unit="page" from="1" to="20" />
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">Current issues in qualitative data analysis software (qdas): A user and developer perspective</title>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">C</forename><surname>Evers</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">The Qualitative Report</title>
		<imprint>
			<biblScope unit="volume">23</biblScope>
			<biblScope unit="page" from="61" to="73" />
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">How data curation enables epistemically responsible reuse of qualitative data</title>
		<author>
			<persName><forename type="first">S</forename><surname>Karcher</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">D</forename><surname>Kirilova</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Pagé</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Weber</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">The Qualitative Report</title>
		<imprint>
			<biblScope unit="volume">26</biblScope>
			<biblScope unit="page" from="1996" to="2010" />
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<monogr>
		<title/>
		<author>
			<persName><forename type="first">Qdaminer</forename><surname>Qdaminer</surname></persName>
		</author>
		<ptr target="https://provalisresearch.com/products/qualitative-data-analysis-software/" />
		<imprint>
			<date type="published" when="2023-04">April 2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<monogr>
		<title/>
		<author>
			<persName><forename type="first">Quirkos</forename><surname>Quirkos</surname></persName>
		</author>
		<ptr target="https://www.quirkos.com" />
		<imprint>
			<date type="published" when="2023-04">April 2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<monogr>
		<title/>
		<author>
			<persName><forename type="first">Transana</forename><surname>Transana</surname></persName>
		</author>
		<ptr target="https://www.transana.com" />
		<imprint>
			<date type="published" when="2023-04">April 2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<analytic>
		<title level="a" type="main">Leveraging the data lake: Current state and challenges</title>
		<author>
			<persName><forename type="first">C</forename><surname>Giebler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Gröger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Hoos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Schwarz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Mitschang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Big Data Analytics and Knowledge Discovery: 21st International Conference, DaWaK 2019</title>
				<meeting><address><addrLine>Linz, Austria</addrLine></address></meeting>
		<imprint>
			<publisher>Springer</publisher>
			<date type="published" when="2019">August 26-29, 2019. 2019</date>
			<biblScope unit="page" from="179" to="188" />
		</imprint>
	</monogr>
	<note>Proceedings 21</note>
</biblStruct>

<biblStruct xml:id="b20">
	<monogr>
		<author>
			<persName><forename type="first">R</forename><surname>Hai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Quix</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Jarke</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2106.09592</idno>
		<title level="m">Data lake concept and systems: a survey</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b21">
	<analytic>
		<title level="a" type="main">Data warehouse modernization</title>
		<author>
			<persName><forename type="first">P</forename><surname>Russom</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">TDWI Best Pract Rep</title>
		<imprint>
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<monogr>
		<title level="m" type="main">An introduction to the dataverse network as an infrastructure for data sharing</title>
		<author>
			<persName><forename type="first">G</forename><surname>King</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2007">2007</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b23">
	<analytic>
		<title level="a" type="main">Automating open science for big data</title>
		<author>
			<persName><forename type="first">M</forename><surname>Crosas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>King</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Honaker</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Sweeney</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">The ANNALS of the American Academy of Political and Social Science</title>
		<imprint>
			<biblScope unit="volume">659</biblScope>
			<biblScope unit="page" from="260" to="273" />
			<date type="published" when="2015">2015</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b24">
	<analytic>
		<title level="a" type="main">Using provenance in data analytics for seismology: Challenges and directions</title>
		<author>
			<persName><forename type="first">U</forename><forename type="middle">S</forename><surname>Da Costa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">A</forename><surname>Espinosa-Oviedo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">A</forename><surname>Musicante</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Vargas-Solar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J.-L</forename><surname>Zechinelli-Martini</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">European Conference on Advances in Databases and Information Systems</title>
				<imprint>
			<publisher>Springer</publisher>
			<date type="published" when="2022">2022</date>
			<biblScope unit="page" from="311" to="322" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b25">
	<analytic>
		<title level="a" type="main">Towards a human-in-the-loop curation: A qualitative perspective</title>
		<author>
			<persName><forename type="first">A</forename><surname>Adorjan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Vargas-Solar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Motz</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">2022 IEEE/ACS 19th International Conference on Computer Systems and Applications (AICCSA)</title>
				<imprint>
			<publisher>IEEE</publisher>
			<date type="published" when="2022">2022</date>
			<biblScope unit="page" from="1" to="8" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b26">
	<monogr>
		<title/>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">D</forename><surname>Wilkinson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Dumontier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><forename type="middle">J</forename><surname>Aalbersberg</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Appleton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Axton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Baak</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Blomberg</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J.-W</forename><surname>Boiten</surname></persName>
		</author>
		<imprint/>
	</monogr>
</biblStruct>

<biblStruct xml:id="b27">
	<analytic>
		<title level="a" type="main">The fair guiding principles for scientific data management and stewardship</title>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">B</forename><surname>Da Silva Santos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">E</forename><surname>Bourne</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Scientific data</title>
		<imprint>
			<biblScope unit="volume">3</biblScope>
			<biblScope unit="page" from="1" to="9" />
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b28">
	<analytic>
		<author>
			<persName><forename type="first">E</forename><surname>Scholly</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">N</forename><surname>Sawadogo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">A</forename><surname>Espinosa-Oviedo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Favre</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Loudcher</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Darmont</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Noûs</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">goldmedal: une nouvelle contribution à la modélisation générique des métadonnées des lacs de données</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b29">
	<monogr>
		<author>
			<persName><forename type="first">A</forename><surname>Diouan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Ferey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Loudcher</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Darmont</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Noûs</surname></persName>
		</author>
		<title level="m">Métadonnées des lacs de données et principes fair</title>
				<imprint>
			<publisher>EDA</publisher>
			<date type="published" when="2022">2022. 2022</date>
		</imprint>
	</monogr>
	<note>18e journées Business Intelligence et Big Data</note>
</biblStruct>

<biblStruct xml:id="b30">
	<monogr>
		<author>
			<persName><forename type="first">R</forename><surname>Van De Schoot</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>De Bruin</surname></persName>
		</author>
		<title level="m">Zenodo: SciNLP: Natural Language Processing and Data Mining for Scientific Text</title>
				<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
	<note>Researcher-in-the-loop for systematic reviewing of text databases</note>
</biblStruct>

<biblStruct xml:id="b31">
	<analytic>
		<title level="a" type="main">Society-in-the-loop: programming the algorithmic social contract</title>
		<author>
			<persName><forename type="first">I</forename><surname>Rahwan</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Ethics and information technology</title>
		<imprint>
			<biblScope unit="volume">20</biblScope>
			<biblScope unit="page" from="5" to="14" />
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b32">
	<analytic>
		<title level="a" type="main">Human-in-the-loop machine learning: A state of the art</title>
		<author>
			<persName><forename type="first">E</forename><surname>Mosqueira-Rey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Hernández-Pereira</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Alonso-Ríos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Bobes-Bascarán</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Á</forename><surname>Fernández-Leal</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Artificial Intelligence Review</title>
		<imprint>
			<biblScope unit="volume">56</biblScope>
			<biblScope unit="page" from="3005" to="3054" />
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b33">
	<analytic>
		<title level="a" type="main">The role of metadata in reproducible computational research</title>
		<author>
			<persName><forename type="first">J</forename><surname>Leipzig</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Nüst</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">T</forename><surname>Hoyt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Ram</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Greenberg</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Patterns</title>
		<imprint>
			<biblScope unit="volume">2</biblScope>
			<biblScope unit="page">100322</biblScope>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b34">
	<analytic>
		<title level="a" type="main">A fair model catalog for ontology-driven conceptual modeling research</title>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">P F</forename><surname>Barcelos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">P</forename><surname>Sales</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Fumagalli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Fonseca</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><forename type="middle">V</forename><surname>Sousa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Romanenko</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kritz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Guizzardi</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Conceptual Modeling</title>
		<imprint>
			<biblScope unit="volume">73</biblScope>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
	<note>ER</note>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
