<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Combining Large Language Model Classifications and Active Learning for Improved Technology-Assisted Review</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Michiel</forename><forename type="middle">P</forename><surname>Bron</surname></persName>
							<email>m.p.bron@uu.nl</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Department of Information and Computing Sciences</orgName>
								<orgName type="department" key="dep2">Faculty of Science</orgName>
								<orgName type="institution">Utrecht University</orgName>
								<address>
									<settlement>Utrecht</settlement>
									<country key="NL">The Netherlands</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="institution">The Netherlands&apos; National Police</orgName>
								<address>
									<settlement>The Hague</settlement>
									<country key="NL">The Netherlands</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Berend</forename><surname>Greijn</surname></persName>
							<email>b.greijn@uu.nl</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Department of Information and Computing Sciences</orgName>
								<orgName type="department" key="dep2">Faculty of Science</orgName>
								<orgName type="institution">Utrecht University</orgName>
								<address>
									<settlement>Utrecht</settlement>
									<country key="NL">The Netherlands</country>
								</address>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="department" key="dep1">Department of Methods and Statistics</orgName>
								<orgName type="department" key="dep2">Faculty of Social Sciences</orgName>
								<orgName type="institution">Utrecht University</orgName>
								<address>
									<settlement>Utrecht</settlement>
									<country key="NL">The Netherlands</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Bruno</forename><forename type="middle">Messina</forename><surname>Coimbra</surname></persName>
							<affiliation key="aff2">
								<orgName type="department" key="dep1">Department of Methods and Statistics</orgName>
								<orgName type="department" key="dep2">Faculty of Social Sciences</orgName>
								<orgName type="institution">Utrecht University</orgName>
								<address>
									<settlement>Utrecht</settlement>
									<country key="NL">The Netherlands</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Rens</forename><surname>Van De Schoot</surname></persName>
							<affiliation key="aff2">
								<orgName type="department" key="dep1">Department of Methods and Statistics</orgName>
								<orgName type="department" key="dep2">Faculty of Social Sciences</orgName>
								<orgName type="institution">Utrecht University</orgName>
								<address>
									<settlement>Utrecht</settlement>
									<country key="NL">The Netherlands</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Ayoub</forename><surname>Bagheri</surname></persName>
							<email>a.bagheri@uu.nl</email>
							<affiliation key="aff2">
								<orgName type="department" key="dep1">Department of Methods and Statistics</orgName>
								<orgName type="department" key="dep2">Faculty of Social Sciences</orgName>
								<orgName type="institution">Utrecht University</orgName>
								<address>
									<settlement>Utrecht</settlement>
									<country key="NL">The Netherlands</country>
								</address>
							</affiliation>
						</author>
						<!-- NOTE(review): GROBID mis-parsed an ORCID footnote into two spurious
						     author records ("B. Messina Coimbra)" as an institution name and the
						     ORCID 0000-0001-7736-2091 as a postal code). Reconstructed below as an
						     ORCID idno for B. Messina Coimbra; the "(R. van de Schoot)" fragment
						     could not be attached to an identifier - verify against the source PDF. -->
						<author>
							<persName><forename type="first">Bruno</forename><forename type="middle">Messina</forename><surname>Coimbra</surname></persName>
							<idno type="ORCID">0000-0001-7736-2091</idno>
						</author>
						<title level="a" type="main">Combining Large Language Model Classifications and Active Learning for Improved Technology-Assisted Review</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">CB080963FF9364F5151C5F97E6529223</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T17:23+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>technology-assisted review</term>
					<term>active learning</term>
					<term>large language model</term>
					<term>information retrieval</term>
					<term>weak supervision</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>Technology-assisted review (TAR) is software that aids in high-recall information retrieval tasks, such as abstract screening for systematic literature reviews. Often, TAR systems use a form of Active Learning (AL); during this process, human reviewers label documents as relevant or irrelevant according to a screening protocol, while the system incrementally updates a classifier based on the reviewers' previous decisions. After each model update, the system uses the classifier to rerank the remaining workload by prioritizing predicted relevant documents over irrelevant ones, enabling a reduced workload. Recently, studies have been performed that study the ability of solely using Large Language Models (LLMs) to perform this task by supplying the LLM prompts that contain the task, screening protocol, and a document from the corpus. The LLM then provides a classification of the document in question. While the results of these studies are promising, the LLM's predictions are not error-free, resulting in a recall or precision that is lower than desired. In this work, we propose a new Active Learning method for TAR that integrates the results of the LLM in the review process that may correct some of the shortcomings of the LLM results, leveraging a reduced workload with respect to current TAR systems.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>Technology-assisted review (TAR) is software that aids in high-recall (information) retrieval (HRR) tasks. An example of such a task is performing a Systematic Literature Review (for example, in medicine <ref type="bibr" target="#b0">[1]</ref>), but there are also applications in the legal domain (e.g., e-Discovery <ref type="bibr" target="#b1">[2]</ref>, but also the processing of Freedom of Information Act Requests, criminal investigation, etc.). For all these search tasks, it is important that nearly all relevant information is found, so these have a recall target of 75 -100 % <ref type="bibr" target="#b2">[3]</ref>.</p><p>In these extensive studies, the researchers, attorneys, or investigators gather evidence or information by screening documents stored in large databases or corpora. The task is to find nearly all information relevant to the subject of the investigation. In the case of Systematic Literature Reviews, the researcher starts by using specialized search queries to select documents from databases. Formulating these queries is not a trivial task, as it is the objective to capture (nearly) all relevant documents. These queries should not be too restrictive to minimize the chance that a relevant document is missed; researchers often use disjunctions rather than conjunctions. Consequently, the resulting set of candidate documents the researchers process is often enormous, while the prevalence of relevant documents within these sets can be very low.</p><p>More formally, we can specify this task as follows: we have a dataset 𝒟 containing all the candidate documents found after the initial keyword search. During the review process, these documents are read by the domain experts and labeled as either relevant or irrelevant. Read documents are referred to as</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 1</head><p>Typical process statistics for Systematic Literature Reviews. Users query multiple databases using keyword search, which yields a candidate set of records 𝒟 which are screened. In this work, we aim to optimize this screening phase. After the title-abstract screening phase, the reviewers will read the full-text of the remaining set of documents (𝒟 + ), which will determine the definitive eligibility for inclusion in the review or meta-analysis. labeled. During the process, we maintain two sets ℒ + and ℒ − for the labeled relevant and irrelevant documents. The remaining unlabeled documents belong to the set 𝒰. Traditionally, researchers screened all documents in 𝒟. Technology-Assisted Review methods are then systems or algorithms that aid the reviewers in reducing the reviewing workload <ref type="bibr" target="#b3">[4]</ref>, while still aiming to find all relevant documents 𝒟 + .</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Databases Keyword Search (𝒟) Title Abstract (𝒟</head><p>Early TAR methods consisted of first creating a randomly sampled subset of 𝒟 and training a classifier on the labeled dataset ℒ. Then, that classifier is used to classify the remaining documents in 𝒰 <ref type="bibr" target="#b4">[5]</ref>. Many recent TAR systems use a form of Active Learning to update the classifier after each or several review decisions iteratively <ref type="bibr" target="#b5">[6,</ref><ref type="bibr" target="#b6">7,</ref><ref type="bibr" target="#b7">8,</ref><ref type="bibr" target="#b8">9,</ref><ref type="bibr" target="#b9">10,</ref><ref type="bibr" target="#b10">11]</ref>. AL is a Machine Learning technique that is used to train a classifier with fewer labeled data points while retaining good performance. In this setting, the model can interactively query an oracle (i.e., the domain expert) to label data points with the desired output of the Machine Learning model (i.e., in the case of a classification task, the class of the data point). In our case, the model should predict each document's relevancy or inclusion status. In canonical Active Learning, the selection strategy aims to select the "most informative" examples from the perspective of the classifier. An example of such a strategy is Uncertainty Sampling <ref type="bibr" target="#b11">[12]</ref>. The goal of canonical AL is to create a good inductive classifier, that can be used to classify previously unseen documents not found in the pool of potential training examples.</p><p>Within TAR, the model is used in a transductive setting only, i.e., the model is only used to retrieve the relevant data within the pool. The model is not used after the retrieval task has been completed <ref type="bibr" target="#b12">[13]</ref>. 
Many TAR systems (e.g., <ref type="bibr" target="#b13">[14,</ref><ref type="bibr" target="#b6">7,</ref><ref type="bibr" target="#b8">9,</ref><ref type="bibr" target="#b7">8]</ref>) use relevance sampling <ref type="bibr" target="#b14">[15]</ref>, a greedy batch sampling method that selects a batch ℬ with the top-𝑘 documents with the highest probability of belonging to the class of relevant documents according to the trained model. After the annotation of each document in ℬ, the model is retrained, and a new ranking for the documents in 𝒰 is produced. The objective is then to find all the remaining unlabeled relevant documents belonging to the set 𝒰 + , while minimizing reading documents that belong to the set 𝒰 − .</p><p>For abstract screening, 𝒟 consists of title-abstract pairs, which the reviewers screen for eligibility for the researcher's systematic review or meta-analysis. The researchers follow a protocol that consists of inclusion and exclusion criteria to determine the eligibility of a record (in Section 4 -Figure <ref type="figure" target="#fig_3">2</ref>, an example of such a protocol is displayed). This protocol should be followed strictly to ensure fairness and mitigate bias. Typical statistics of this process are given in Table <ref type="table" target="#tab_0">1</ref>.</p><p>Eligibility cannot always be determined from the title-abstract pair only due to the limited amount of information stored there, so reading the full-text of the paper is necessary to decide on definitive eligibility. Reading the full-text is associated with a high cost. Title-abstract screening greatly reduces the number of papers that have to be screened fully. 
TAR systems then aid in reducing the number of irrelevant title-abstract pairs so that not all records have to be screened.</p><p>Recently, methods have been proposed that use generative Large Language Models (LLMs) systems to perform title-abstract screening (inter alia <ref type="bibr" target="#b15">[16,</ref><ref type="bibr" target="#b16">17,</ref><ref type="bibr" target="#b10">11,</ref><ref type="bibr" target="#b17">18]</ref>). The main approach is to prepare a prompt that delineates the task and specifies the criteria, followed by the title and abstract. After supplying the prompt to the LLM, it will provide an answer and a decision on the inclusion status of that record. Obtaining results can be automated by making a program or script that automatically processes a dataset through the models' API. In <ref type="bibr" target="#b15">[16]</ref>, the authors report a mean accuracy of ± 90 % with a recall of 76 %. However, the performance varied per dataset, with recall scores ranging from 59 % to 100 %. In another study, the reported precision is low for some datasets <ref type="bibr" target="#b10">[11]</ref>, which may result in a higher screening workload than current AL-based systems offer.</p><p>LLMs are prone to hallucination, where the LLMs generate responses that seem plausible but are factually incorrect <ref type="bibr" target="#b18">[19]</ref>. Moreover, LLMs are very eager to provide an answer even though there is no information provided in the LLM's training data or within the prompt to give a good answer <ref type="bibr" target="#b19">[20]</ref>. With the current limitations, using the LLMs to determine the inclusion status of the title and abstract pairs may not be reliable enough.</p><p>In <ref type="bibr" target="#b20">[21]</ref>, the authors propose a system that combines (canonical) AL with Weak Supervision (e.g., noisy labels provided by a black-box model). 
To our knowledge, a TAR method that combines AL and noisy labels (e.g., from an LLM or another model) has not been presented yet. In this work, we propose a system that combines LLM classifications and Active Learning to improve the efficacy of the TAR procedure. Our main contributions can be summarized as follows:</p><p>1. A system that provides more detailed LLM classifications for all the criteria in the screening protocol instead of a single binary label for inclusion. 2. A system that makes LLM classifications more transparent by making the LLM provide a detailed explanation for each classification. 3. An Active Learning method that incorporates the LLM results to reduce the workload of the review. 4. A preliminary experimental evaluation of our method and several suggestions for future work.</p><p>In the following section, we will briefly overview previous work on TAR, LLM classification and techniques for combining weak supervision and AL. After that, we will explain our method, which consists of an LLM classifier and an Active Learning method that incorporates its predictions. As the LLM classifier assigns labels to each specific criterion, we introduce a case study in which we study a novel dataset that contains labels for each record at the criterion level, enabling us to assess the performance of our method. Finally, we will present our initial experiments and results, followed by a discussion and suggestions for future work.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Related Work</head><p>Most TAR approaches are based on the Continuous Active Learning (CAL) algorithm (see Algorithm 1) <ref type="bibr" target="#b21">[22]</ref>. In this process, a model is trained on the documents that have already been reviewed. The model is then used to rerank the remaining documents in 𝒰. Several CAL procedures <ref type="bibr" target="#b7">[8,</ref><ref type="bibr" target="#b22">23,</ref><ref type="bibr" target="#b8">9,</ref><ref type="bibr" target="#b6">7]</ref> require a set of seed documents provided by the reviewer. This set needs to contain at least one relevant document, but it does not need to be a document from 𝒟; it may also contain a description of the research topic as a pseudo-document. Additionally, one example of an irrelevant document is needed.</p><p>AutoTAR <ref type="bibr" target="#b13">[14]</ref> extends the CAL procedure, which is still considered state-of-the-art and has been included in many studies as a baseline, for example, when studying ideal performance vs. the performance of a stopping criterion <ref type="bibr" target="#b9">[10,</ref><ref type="bibr" target="#b23">24,</ref><ref type="bibr" target="#b24">25]</ref>. Instead of just training on the labeled documents ℒ + , ℒ − , it samples a set of documents from the unlabeled set 𝒰, which are temporarily assumed to be irrelevant; a fair assumption, given the low prevalence of relevant documents in most datasets. ASReview <ref type="bibr" target="#b8">[9]</ref>, opensource TAR software specialized for abstract screening, resamples the data to improve the performance in the presence of imbalanced training data. 
FASTREAD2 <ref type="bibr" target="#b6">[7]</ref> modifies the CAL procedure with the goal of detecting human errors during the review procedure, as noisy human labels may occur <ref type="bibr" target="#b25">[26]</ref>.</p><p>CAL, as described in Algorithm 1, leaves the question of a Stopping Criterion open (i.e., the StoppingCriterion procedure, line 15 in Algorithm 1, is not given). Formulating a good stopping criterion is an area of active research. Some practitioners use pragmatic criteria based on time constraints or stop when the returns diminish (e.g., when TAR proposes 𝑘 irrelevant documents in a row; however, specifying 𝑘 is target and topic dependent) <ref type="bibr" target="#b26">[27]</ref>. Several heuristics <ref type="bibr" target="#b13">[14,</ref><ref type="bibr" target="#b6">7,</ref><ref type="bibr" target="#b27">28,</ref><ref type="bibr" target="#b26">27]</ref> (for example, characteristics of the recall curve) have been proposed, as well as methods that change the CAL procedure to allow the use of statistical methods that predict when a recall target has been achieved (inter alia <ref type="bibr" target="#b9">[10,</ref><ref type="bibr" target="#b22">23,</ref><ref type="bibr" target="#b23">24]</ref>).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Algorithm 1</head><p>The Continuous Active Learning algorithm. The algorithm requires as parameters a dataset 𝒟, an unlabeled set of documents 𝒰, labeled documents ℒ + , ℒ − , a classifier 𝐶, a batch size 𝑘. The Active Learning procedure selects new documents according to the relevance predictions of the classifier 𝐶, which are updated after each batch of labeling decisions. The classifiers that are used in these systems are often based on classical Machine Learning algorithms like Multinomial Naïve Bayes, Logistic Regression (AutoTAR), and Support Vector Machines combined with TF-IDF features. However, some recent studies explore using neural networks and deep learning (e.g., <ref type="bibr" target="#b2">[3,</ref><ref type="bibr" target="#b28">29]</ref>).</p><p>This work focuses on applying TAR to aid abstract screening for systematic reviews. In this field, state-of-the-art systems can find (nearly) all relevant documents after screening 5 -40 % of the corpus by using this general methodology <ref type="bibr" target="#b7">[8,</ref><ref type="bibr" target="#b8">9]</ref>, but performance is dataset and query dependent. A frequently used metric to assess the efficacy of TAR systems is Work Saved over Sampling (WSS), which indicates the work savings over the use of random sampling (i.e., traditional screening) <ref type="bibr" target="#b4">[5]</ref>. 
This metric can be calculated after the procedure was terminated after a stopping criterion was triggered or when a recall target has been achieved according to the ground truth; WSS@95, which indicates the work savings over random sampling at the moment when 95 % recall is achieved, is a frequently used metric for TAR systems targeting Systematic Literature Reviews (inter alia <ref type="bibr" target="#b22">[23,</ref><ref type="bibr" target="#b7">8,</ref><ref type="bibr" target="#b8">9]</ref>).</p><p>In contrast to the AL-based methods, after the popularization of generative Large Language Models like ChatGPT-3.5 and GPT-4 <ref type="bibr" target="#b29">[30]</ref>, systems have been proposed that use these models to perform screening tasks. The main approach is to prepare a prompt that delineates the task and specifies the criteria, followed by the title and abstract <ref type="bibr" target="#b15">[16,</ref><ref type="bibr" target="#b16">17,</ref><ref type="bibr" target="#b31">31,</ref><ref type="bibr" target="#b17">18]</ref>. Many approaches use ChatGPT-3.5 or GPT-4 <ref type="bibr" target="#b15">[16]</ref>, several <ref type="bibr" target="#b10">[11,</ref><ref type="bibr" target="#b17">18]</ref> use open-source LLMs such as Llama 2 <ref type="bibr" target="#b32">[32]</ref>. In <ref type="bibr" target="#b17">[18]</ref>, a large simulation study is performed to assess the performance of several LLMs on popular TAR datasets (CLEF2017, CLEF2018, CLEF2019) <ref type="bibr" target="#b33">[33,</ref><ref type="bibr" target="#b34">34,</ref><ref type="bibr" target="#b35">35]</ref>; however, in this study, the LLM predicts the inclusion status only on the title of the systematic review, not its screening protocol (the CLEF datasets do not offer a lot of information on the screening protocol, although the keyword searches are available and a topic description is available). 
Contrary to the other methods, <ref type="bibr" target="#b17">[18]</ref> compares the next token probabilities of yes and no (which are used to indicate the inclusion decision), which can be used as a measure of confidence.</p><p>There have been several works that combine or compare LLMs and Active Learning. For example, in <ref type="bibr" target="#b36">[36]</ref>, the authors compare the performance of LLMs and models that have been trained with Active Learning. One of the findings is that with a limited number of labeled documents, the AL-trained models outperform the LLMs that perform zero-shot classification despite being significantly smaller in terms of training parameters. In <ref type="bibr" target="#b37">[37]</ref>, a method is proposed that integrates an LLM as an annotator for the creation of Named Entity Recognition (NER) models in underrepresented languages (e.g., African languages). Another work presents a method that generates synthetic data with LLMs, which are used to select the most interesting examples from the pool of unlabeled documents <ref type="bibr" target="#b39">[38]</ref>.</p><p>In <ref type="bibr" target="#b20">[21]</ref>, the authors present a method that combines AL with Weak Supervision and Transfer Learning. They present their results on training a classifier for classifying financial transactions (text data) in the presence of a black-box model (BBM) (a rule-based system). In this study, an annotator model is trained on agreement labels between the black-box model and the oracle's labels for each iteration along the typical classifier. The annotator model is used to determine per selected instance if the BBM's label can be trusted and accepted or if the human oracle should label it instead. With this method, the authors show that they could significantly lower annotation costs while retaining an accuracy close to the traditional AL setting.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Methodology</head><p>In this section, we describe the general architecture of our method. Our TAR procedure consists of two main components: a method to obtain classifications from the LLM and an Active Learning procedure that is used to rank the records during the review phase. Our AL procedure, LLM+CAL, uses the results of the LLM to reduce the review workload further.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Obtaining LLM classifications</head><p>In <ref type="bibr" target="#b31">[31,</ref><ref type="bibr" target="#b15">16]</ref>, a prompt contained the task and the full screening protocol. The task for the LLM was then to answer only with a final inclusion decision (e.g., choose between INCLUDE or EXCLUDE). This setup can be regarded as a black-box system, as it is impossible to determine any of its reasoning for making the decision. Also, the LLM does not provide any information about the confidence in its prediction besides a probability of predicting the token that represents the word INCLUDE or EXCLUDE over the space of all possible output tokens.</p><p>Chain-of-thought prompting is a method to improve the accuracy of LLMs when performing complex reasoning. With this method, it is specifically requested in the prompt to think step-by-step in addition to a few examples of appropriate answers. The aim is to let the LLM reason about its "thought process" verbosely, which results in a higher probability that the final answer is correct <ref type="bibr" target="#b40">[39]</ref>. By adjusting the prompt to let the LLM respond with chain-of-thought steps in a structured way, we aim to make the process more transparent for the reviewer. In addition, we ask the LLM to provide rationales (i.e., select fragments cited directly from the record in question), which enables tracing the decision to the source document. In Figure <ref type="figure" target="#fig_2">1</ref>, we display the prompt template that we use in our experiments, which contains -besides the instruction -a few examples of appropriate answers. We wrote a parser that parses the LLM answer into a structured datatype. In a real-world application, the rationales can be used to highlight fragments in the abstracts used in the LLM's decision-making, enabling easy verification and correction for the end-user in an annotation interface. 
Another significant difference between the studies in earlier work and ours is that we consider each criterion in the protocol separately. We noticed many classification errors in initial experiments when the whole screening protocol was considered. We list some major error categories below:</p><p>Hallucination. The model makes up factually incorrect but seemingly plausible answers.</p><p>Missing knowledge or context. The model does not know enough information about a topic that a human reviewer might know (e.g., technical jargon)</p><p>Incorrect reasoning. The information extraction works correctly, but the inclusion rules are not followed, causing a misclassification.</p><p>Ignoring instructions. Only a part of the screening protocol was used according to the LLM's chain-ofthought response. Some LLMs have problems following all instructions in the prompt, especially when the instructions are long and complex. Larger models like GPT-4 are less prone to this but have a higher computational and financial cost.</p><p>Often, the LLM followed the protocol partially: consider a dataset with four criteria, the LLM considered three criteria correctly but mistakenly ignored one of them, causing a misclassification of ASSIGNMENT: You are a helpful assistant who helps screen abstracts and titles of scientific papers. You answer questions by citing evidence in the given text followed by a YES or NO or UNKNOWN decision. When there is no evidence in the title and abstract, decide with UNKNOWN. Only answer with NO if there is absolute evidence given that the answer is NO. In the absence of evidence or when nothing is mentioned, always answer UNKNOWN. Use the following format: REASONING: (Think step by step to answer the question; use the information in the title and abstract and work your way to an answer. 
Your full reasoning and answer should be given in this field) EVIDENCE: (List sentences or phrases from the title and abstract used to answer the question in the previous field. Answer in bullets (e.g., -"quoted sentence"). Each quoted sentence should have its own line. If there is no evidence, write down []). In this field, only directly cite from the TITLE and ABSTRACT fields. <ref type="bibr">DO</ref>   the whole instance due to a mistake. This setup makes it challenging to detect failures due to a specific criterion. Mistakes become only apparent by combing through the (semi-structured) LLM answers containing information on all criteria.</p><p>We aim to mitigate this by considering each criterion separately, making the set of instructions shorter and less complex, which results in a higher accuracy. The system can then infer the inclusion status of a record by applying a simple logical formula to the model's decision on the criteria (for example, Figure <ref type="figure" target="#fig_3">2</ref>).</p><p>Despite the reduced complexity, it is still possible that the LLMs make classification errors, for example, due to hallucination, possibly because of missing knowledge. We hypothesize that these errors will not always happen at random, especially for the latter cause. Suppose the LLM makes an incorrect classification for a specific criterion due to missing knowledge. In that case, the LLM will likely make a similar mistake for instances similar to the one in question. Collecting the rationales and chain-of-thought fragments of misclassifications and training models on them might aid in predicting when the LLM makes a mistake or a correct decision.</p><p>We used LangChain <ref type="bibr" target="#b41">[40]</ref> to build our LLM classification pipeline. This package enables us to target multiple Large Language Models. 
In our experiments, we only worked with ChatGPT-3.5 (specifically, version 0301); however, the method can be applied to GPT-4 or models of other vendors, such as open-source models published on repositories like HuggingFace <ref type="bibr" target="#b42">[41]</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Active Learning method</head><p>As in canonical TAR, we represent each document as a high-dimensional vector. A typical feature extraction method is a bag-of-words method like TF-IDF that TAR systems frequently use. Combining sparse feature matrices and classical machine learning methods offers fast retraining and reranking of the documents in 𝒰. The AutoTAR baseline uses TF-IDF combined with a Logistic Regression classifier. In our approach, we will also use TF-IDF and Logistic Regression to ensure that changes in performance are not due to changes in the document representation.</p><p>During the process, the labeling task is specified as follows: we have a feature space 𝒳 tiab , which contains the feature vectors of the title-abstract (tiab) records. Each document presented to the oracle gets, for each of the criteria (see Figure <ref type="figure" target="#fig_3">2</ref>), a label in the space 𝒴 crit 𝑖 = {+, ?, ¬} corresponding to True (Yes), Unknown, False (No). The option Unknown is vital in this phase, as it is not always the case that the information needed to determine eligibility for a criterion is present in the title and abstract.</p><p>Our method, LLM+CAL, consists of two phases: the first phase is called LLMPreferred, which is -in essence -a version of the method AutoTAR, but in this version constrained to select from the unlabeled documents that are included by the LLM (𝒰 ∩ ℒ {+,?} LLM ). As initial training data, the whole screening protocol is given in addition to a random sample of 100 LLM-excluded documents (ℒ − LLM ). This phase is applied until 25 consecutive irrelevant documents are proposed, which might indicate that the set of relevant documents may be exhausted.</p><p>Because the possibility exists that there are relevant documents that the LLM does not find, we will switch to the CriteriaWSA method, which can query all documents within 𝒰. 
First, all labeled data ℒ from the first phase is transferred to this method. Then, several machine learning models are trained: Inclusion Judgment Classifier. A Binary Classifier trained on the labeled data in ℒ after transforming the labels to 𝒴 binary = {+, ¬}, in a similar fashion as AutoTAR. The criterion judgments are transformed using the formula specified in Figure <ref type="figure" target="#fig_3">2</ref>, which will result in a label in the space 𝒴 ternary = {+, ?, ¬}. We can then transform 𝒴 ternary to 𝒴 binary by changing each ? into a +.</p><p>Acceptance Classifier. A Binary Classifier that determines Acceptance for each inclusion criterion. This is similar to a method presented in <ref type="bibr" target="#b20">[21]</ref>. Here, for each criterion 𝑖, we obtain binary agreement labels 𝑧 ∈ 𝒵, where 𝒵 = {0, 1}. This is determined by comparing the LLM predictions and the labeled data in ℒ 𝑖 : each instance receives a label Accept (1) if the LLM prediction agrees with the human-annotated label. Otherwise, the label Reject (0) is given. However, contrary to the other models in our system and the method in <ref type="bibr" target="#b20">[21]</ref>, the model is not trained on the Title-Abstract records (𝒳 tiab ), but on the LLM's reasoning fragments 𝒳 ans 𝑖 (see Figure <ref type="figure">4</ref> for example data) of criterion 𝑖.</p><p>Given a TAR task that has four inclusion criteria ({𝑎, 𝑏, 𝑐, 𝑑}), we obtain the following pairs for each labeled record:</p><formula xml:id="formula_0">• 𝒳 tiab × 𝒴 crit𝑎 × 𝒴 crit 𝑏 × 𝒴 crit𝑐 × 𝒴 crit 𝑑 • 𝒳 tiab × 𝒴 binary • 𝒳 tiab × 𝒴 ternary • 𝒳 ans𝑎 × 𝒵 𝑎 • 𝒳 ans 𝑏 × 𝒵 𝑏 • 𝒳 ans𝑐 × 𝒵 𝑐 • 𝒳 ans 𝑑 × 𝒵 𝑑</formula><p>During each annotation round, a batch of ten documents is given to the oracle using relevance sampling based on the ranking produced by the inclusion judgment classifier. The batch size of ten is an initial default value for this parameter. 
Smaller, larger, and dynamic batch sizes can be explored in future work. Another ten documents are sampled based on a ranking that is based on the predictions of the LLM and the Acceptance Classifier using the following equation:</p><formula xml:id="formula_1">score 𝑖 (𝑦 ^LLM 𝑖 , 𝑝 acc 𝑖 ) = ⎧ ⎨ ⎩ 0.75 + 0.25𝑝 acc 𝑖 if 𝑦 ^LLM 𝑖 = + 0.5 + 0.25𝑝 acc 𝑖 if 𝑦 ^LLM 𝑖 = ? 0.5(1 − 𝑝 acc 𝑖 ) if 𝑦 ^LLM 𝑖 = ¬ . (<label>1</label></formula><formula xml:id="formula_2">)</formula><p>Equation 1 is calculated for each study criterion 𝑖, where 𝑦 ^LLM 𝑖 is the LLM's prediction for criterion 𝑖 and 𝑝 acc 𝑖 is the corresponding acceptance probability. The mean of those scores is calculated for each of the unlabeled documents. Then, this score is used to rank the remaining documents in 𝒰. The rationale behind Equation 1 is that instances with a higher probability to be relevant (instances with criteria that have more True labels) are put before documents that have Unknown labels, followed by documents that have False labels. Documents that have False labels and a low acceptance probability will have a higher probability of being selected than documents with False labels that are certain. For the True and Unknown labels, the inverse holds: if there is a higher acceptance probability, they are preferred over instances with lower acceptance probability. 
This is still an initial formulation that may not always work optimally; other options can be explored in future research.</p><p>After this batch of twenty documents has been prepared, they are given to the oracle for labeling unless the LLM has found exclusionary evidence for a specific criterion and its acceptance probability is above 80 % (unless that criterion is a reason for exclusion for all remaining documents in 𝒰); these examples are skipped but may be proposed again in another round if the acceptance probability drops below 80 %.</p><p>This process is repeated until a stopping criterion is triggered, the oracle decides to stop the review, or 𝒰 is exhausted. In our experiments, we will stop querying after reviewing |ℒ {+,?} LLM | documents.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Case Study</head><p>In this work, we compare the performance of various TAR methods on a dataset that is collected for a systematic review (at the time of writing in preparation) that aims to identify common latent groups or classes of PTSS/PTSD (Post-traumatic Stress Symptoms / Post-traumatic Stress Disorder) trajectories, as well as their prevalence and predictors, which may give a better understanding of how and under what circumstances PTSS/PTSD presentations may develop <ref type="bibr" target="#b43">[42]</ref>. For this purpose, researchers reviewed a large corpus of records after querying several databases. During the review, the records were labeled on various levels, which we list below.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Inclusion Criteria:</head><p>𝑎 : Is the study a longitudinal/prospective study with at least three time point assessments? 𝑏 : Does the study assess PTSD symptoms as a continuous variable? [Followed by a list of eligible scales] 𝑐 : Does this study mention that individuals are exposed to traumatic events? 𝑑 : Did the study conduct a PTSD trajectory analysis? [Followed by a list of eligible methods]</p><p>A study 𝑠 can be included in the review when all criteria are satisfied (so, ∀𝑠 ∈ 𝒟 + , 𝑎(𝑠) ∧ 𝑏(𝑠) ∧ 𝑐(𝑠) ∧ 𝑑(𝑠)). Title. Some documents can be excluded by considering the title only. For example, animal studies are never eligible, and the fact that a study is an animal study can become clear from reading the title. We only study the records that have not been excluded by title screening.</p><p>Criterion. The eligibility of a study for inclusion depends on four inclusion criteria (see Figure <ref type="figure" target="#fig_3">2</ref>). For each criterion 𝑖 ∈ {𝑎, 𝑏, 𝑐, 𝑑}, a label 𝒴 crit 𝑖 = {+, ?, ¬}, corresponding to True, Unknown, False can be given. In Figure <ref type="figure" target="#fig_5">3</ref>, some statistics per criterion are displayed.</p><p>Title Abstract. Using the logical formula in Figure <ref type="figure" target="#fig_3">2</ref>, an inclusion judgment can be made for each criterion, so this level can be derived from the criterion level without additional human effort. This will result in a label in the space 𝒴 ternary = {+, ?, ¬}. Because an instance can have an Unknown label for one or more criteria, the final eligibility of such a study must be determined by reading the entire paper without exclusionary evidence in the record.</p><p>Full-text level: Final eligibility depends on reading the full-text of the study. 
This level is not considered in this work because this label needs more information than is available in this dataset (i.e., the full-text of every record).</p><p>This dataset is unique compared to other datasets frequently used for benchmarking TAR systems (e.g., <ref type="bibr" target="#b33">[33,</ref><ref type="bibr" target="#b34">34,</ref><ref type="bibr" target="#b35">35]</ref>), which have only binary inclusion information, sometimes only on the full-text level. Moreover, while these datasets are based on real-world search tasks, there is little to no information about the inclusion/exclusion criteria available. The SYNERGY <ref type="bibr" target="#b44">[43]</ref> corpus consists of several systematic reviews (including an earlier version of the PTSS dataset <ref type="bibr" target="#b45">[44]</ref>) with links to the publications from which the screening protocols can be obtained. Unfortunately, only inclusion labels on the full-text level are included, so we cannot study retrieval efficacy fairly (we can only consider recall of the set of papers that are included based on the full-text, which is a subset of the Title-Abstract included papers; therefore, we cannot distinguish title-abstract inclusions from the false positives). To our knowledge, the dataset used in this case study (for the systematic review in <ref type="bibr" target="#b43">[42]</ref>) is the only systematic review with labels on the criterion level.</p><p>We will consider only the records of one reviewer after title screening here, which results in a set of 4836 records after some data cleaning. Our dataset then contains |𝒟 {+,?} | = 183 records that are included on the title-abstract level, resulting in a prevalence of 3.78 %. One observation that can be drawn from Figure <ref type="figure" target="#fig_5">3</ref> is that criterion 𝑑 determines the title-abstract inclusion label (displayed as judgment) the most.  
This is also visible in the alluvial diagram in Figure <ref type="figure" target="#fig_5">3</ref>, which shows some of the relations between the labels; for example, criterion 𝑐 is True for only a tiny subset of the documents for which 𝑏 is False. Also, it becomes clear that criterion 𝑑 excludes the most documents of all the criteria.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Experimental evaluation</head><p>We compare several methods in a small simulation study on the dataset described in the previous section.</p><p>• AutoTAR, a state-of-the-art TAR method, • The LLM Classifier, as described in Section 3.1, • LLM+CAL, our AL method that integrates the predictions of the LLM Classifier, as described in Section 3.2).</p><p>In this study, we only compare retrieval efficacy as we leave the question of a good stopping criterion open. Therefore, we constrain the run to the number of documents that are predicted by the LLM to be still eligible for inclusion (i.e., the number of documents with the label for which the inclusion judgment prediction is True or Unknown, |ℒ {+,?} LLM |). We let each algorithm run until this number is reached. Then, we can compare the performance of the LLM classifier and the AL-based methods with the same review effort. During the experiment, we will record when various recall levels are triggered. We will record the following metrics (calculated in the space 𝒴 binary ).</p><p>Recall. The percentage of relevant documents found based on the a priori knowledge from the ground truth dataset.</p><formula xml:id="formula_3">𝑅 = |ℒ + | |𝒟 + |<label>(2)</label></formula><p>Work Saved over Sampling. This metric expresses the work reduction over random sampling <ref type="bibr" target="#b4">[5]</ref>.</p><p>We calculate this as follows. We will record this value for several recall targets:</p><formula xml:id="formula_4">𝑊 𝑆𝑆 = |𝒰| |𝒟| − (︂ 1 − |ℒ + | |𝒟 + | )︂<label>(3)</label></formula><p>Equation 3 is used in the AL setting. In the context of a classifier, we equate 𝒰 to the set of documents predicted to be irrelevant (the reviewers do not read those documents). 
For the LLM Classifier, we can adapt the equation as follows.</p><formula xml:id="formula_5">𝑊 𝑆𝑆 = |ℒ − LLM | |𝒟| − (︂ 1 − |ℒ + LLM | |𝒟 + | )︂<label>(4)</label></formula><p>The rest of the section is structured as follows: first, we describe the results of the LLM classification, followed by the results of a simulation study in which we compare the aforementioned AL-based TAR methods.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.1.">LLM Classification results</head><p>In Figure <ref type="figure">4</ref>, we display an example of an annotated record. After parsing the response, we can highlight the fragments the LLM used in its decision-making. This overview is available for every instance in the dataset. When used in an annotation interface, the LLM explanations might aid users in their decision-making process, possibly reducing the screening time per document.</p><p>In Table <ref type="table">2</ref>, confusion matrices per criterion are displayed. A clear observation from Table <ref type="table">2</ref> is that the LLM is more cautious in excluding papers than the human reviewer: the confusion matrices show high numbers of studies with ground truth False and predictions Unknown for all criteria. One of the causes is that when there is no written evidence to make a decision about a criterion, for example, whether or not a PTSD trajectory analysis (criterion 𝑑) was performed, the LLM would predict Unknown. This might seem like the correct decision in this situation. However, experienced human reviewers might exclude a paper based on their knowledge of the field by inferring that from other characteristics (for example, when the abstract describes a methodology that makes it impossible to use one of the eligible methods).</p><p>The LLM's definition of specific terms or the meaning of concepts might diverge from the reviewers'. For example, for criterion 𝑐, in some cases, the LLM eagerly infers from the descriptions of the studied populations that these might be exposed to trauma, which might not explicitly be mentioned in the record. Fortunately, the number of falsely excluded documents per criterion is low.</p><p>When combining the LLMs prediction, we can infer the title-abstract level predictions using the logical formula specified in Figure <ref type="figure" target="#fig_3">2</ref>. 
In Table <ref type="table">2</ref>, the confusion matrix for this level is displayed, both on the ternary and binary levels. On this level, we obtain an accuracy of 78.52 % (ternary level), with a recall of 91.26 % on the binary level. In absolute numbers, this results in the fact that only 16 studies were missed out of the 183. The precision on the binary level is 12.9 %, resulting in a Work Saved over Sampling of 64.48 % (with Equation <ref type="formula" target="#formula_5">4</ref>). Depression: 2.26, 0.51-4.02; PTSD: 6.83, 4.21-9.44). CONCLUSION: Refugee women who experienced past-year intimate partner violence or conflict-related violence may be at increased risk of depression, anxiety or PTSD. Service providers should be aware that compared to the general refugee population, women who have experienced violence may require additional psychological support and recognise the enduring impact of violence that occurred before, during and after periods of conflict and tailor outreach and treatment services accordingly.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Reasoning:</head><p>¬𝑎 The title and abstract do not mention anything about a longitudinal/prospective study with multiple time-point assessments. As there is no evidence to suggest that the study is longitudinal/prospective with at least three-time point assessments, the answer is NO. ?𝑏 The question asks if the study assesses PTSD symptoms as a continuous variable and provides a list of eligible and ineligible PTSD scales. The abstract mentions that the study modelled the relationship between groups of women and symptomatic scores for anxiety, depression, and PTSD using linear regression. However, the abstract does not mention any specific PTSD scales used in the analysis. Therefore, it is not clear whether the study used any of the eligible or ineligible PTSD scales provided in the question, and the answer should be UNKNOWN. 𝑐 The study examines the association between gender-based violence (GBV) and mental health, including anxiety, depression, and post-traumatic stress disorder (PTSD), among women in a Kenyan refugee camp. The abstract mentions that conflict-affected settings put women at risk for GBV, which is associated with poor long-term mental health such as anxiety, depression, and PTSD. The study also found that conflict-related violence was an important risk factor for poor mental health among women who accessed GBV services, despite time since occurrence. Therefore, it is likely that the individuals in the study were exposed to traumatic events. ¬𝑑 The study aimed to investigate the relationship between GBV and mental health among Somali women in a Kenyan refugee camp. However, the methods section does not mention conducting a PTSD trajectory analysis.</p><p>Therefore, the answer is NO.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Figure 4:</head><p>Here, we display an annotated record (title and abstract from <ref type="bibr" target="#b46">[45]</ref>) from the case study. This figure was generated by processing the responses generated by ChatGPT-3.5 to the prompts that were created for this record. The LLM cited the highlighted fragments as rationales for making the decision. Fragments containing evidence to include the record based on the criteria in Figure <ref type="figure" target="#fig_3">2</ref> are highlighted in green. Fragments highlighted in red are used to exclude a record. Yellow highlights indicate contradictory evidence, meaning the information is used as evidence for inclusion for one criterion and exclusionary evidence for another. Below the abstract, the reasoning of the LLM is listed per criterion. (Note that the breaks between highlights are automatically added to prevent overflowing lines during typesetting.) Run on a dataset with 183 inclusions out of 4836 100 % recall (183) 95 % recall (174) Exp. found at random # by AutoTAR WSS@60 -54.1 % WSS@65 -58.6 % WSS@70 -63.6 % WSS@75 -67.8 % WSS@80 -70.9 % WSS@85 -73.5 % WSS@90 -72.7 % WSS@95 -71.6 % Run on a dataset with 183 inclusions out of 4836 100 % recall (183) 95 % recall (174) Exp. found at random # by LLM+CAL # by LLMPreferred # by CriteriaWSA WSS@60 -56.8 % WSS@65 -61.5 % WSS@70 -66.3 % WSS@75 -69.8 % WSS@80 -74.0 % WSS@85 -77.9 % WSS@90 -81.3 % WSS@95 -81.6 % (b) LLM+CAL (ours)  </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">Discussion</head><p>We have shown some preliminary results on our method, which indicate that adding LLM predictions is beneficial to obtaining relevant documents at a lower cost than with the state-of-the-art method AutoTAR as our LLM+CAL method yields higher work savings at several recall targets. Moreover, a reviewer could achieve a better recall and WSS than obtained using only the LLM classifier. We have presented a system that builds upon earlier LLM methods for Systematic Literature Reviews by making the predictions more fine-grained by addressing each inclusion criterion separately. Moreover, our approach aims to make the predictions more accurate and explainable by leveraging chain-of-thought reasoning and asking the LLM to cite from the title-abstract record directly. Our method takes some ideas from <ref type="bibr" target="#b20">[21]</ref> in combining AL and the noisy labels from, in our case, an LLM annotator. We evaluated our method on a single dataset, which may impact the generalizability of our results. Unfortunately, testing on more datasets is not feasible at the time of writing, as our method requires that the dataset has criterion-level labels. It may be interesting if we can adapt the method to work with feedback on the binary inclusion level, which might enable us to consider more datasets that do not have labels this fine-grained. Another interesting avenue is comparing the performance of our method on different LLM results than presented here. The LLM predictions may slightly differ when another model is used or when alternative formulations of inclusion criteria and general instructions are used. 
Further investigation is needed to determine what impact non-optimal instructions have on the LLM's accuracy and the ability of our method to correct lower-quality weak labels.</p><p>The method we presented here is still relatively simple; several extensions can be made that might further improve the efficacy. For example, incorporating Transfer Learning (as in <ref type="bibr" target="#b20">[21]</ref>). Another area that can be explored further is the sampling strategy. Currently, our sampling strategy is based on a binary Logistic Regression classifier and TF-IDF features (as in AutoTAR). Considering other classifiers like Neural Networks and text embeddings like SentenceBERT <ref type="bibr" target="#b47">[46]</ref> might yield additional performance gains over traditional methods.</p><p>We currently do not use the criterion-level labels during model training and subsequently rank documents in 𝒰 with those models. Designing a good method that combines the results of the four classifiers in a ranking is not trivial. Equation 1 is a starting point (now applied to LLM only) but not optimal. Relations between criteria have also not been taken into account yet. For example, assume a scenario where, within nearly all labeled records in ℒ, the proposition 𝑎 ∧ 𝑏 ∧ 𝑑 → 𝑐 holds. When, for a new instance, the LLM predicts the following labels {𝑎, 𝑏, ¬𝑐, 𝑑}, this record may be an interesting example to review for the oracle because it is an exception to what has been seen so far.</p><p>So far, the LLM rationales have not been used to train the classifier. In <ref type="bibr" target="#b48">[47]</ref>, (human annotated) rationales were used as additional training data besides 𝒳 tiab for TAR for Systematic Literature Reviews, suggesting it might be beneficial to consider the LLM rationales during training as well.</p><p>As mentioned before, we have left the question of a stopping criterion open. 
One avenue could be to combine the method with an existing stopping criterion or to use the LLM predictions to determine an optimal stopping point.</p><p>During a review, regardless of whether it is performed in the traditional setting or with TAR, labeling mistakes occur due to human error <ref type="bibr" target="#b6">[7,</ref><ref type="bibr" target="#b25">26]</ref>. As in <ref type="bibr" target="#b20">[21]</ref>, our method assumes that the oracle always makes the correct decision; however, this may not always be the case. Presenting the LLM rationales and chain-of-thought fragments (like in Figure <ref type="figure">4</ref>) may help the oracle to make better decisions and prevent some mistakes, but the extent of this has to be further investigated. Also, the Active Learning part of our method could be adapted to consider the possibility of human errors.</p><p>We believe several ideas presented here might also benefit research areas other than TAR. For example, the LLM framework presented here can be applied to text classification tasks in general. However, adapting our method to a canonical AL setting is more appropriate in this setting. The framework we presented here enables obtaining weak labels at a low cost, with little engineering effort besides writing a good labeling protocol, and chain-of-thought prompting may aid in spotting errors within them, enabling more efficient creation of text classification models.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head></head><label></label><figDesc>NOT USE YOUR OWN WORDS, AND ADHERE TO THE LIST FORMAT! ANSWER: (Summarize your answer from the REASONING field with YES or NO or UNKNOWN. DO NOT WRITE ANYTHING AFTERWARDS IN THIS FIELD.) Write nothing else afterward. EXAMPLE RESPONSE 1: REASONING: To answer the question, we need to find information about [. . .]. The title and the abstract mention that [. . .]. Furthermore, the study aims to [. . .], suggesting that this is indeed the case. 
So, the answer to this question is YES. EVIDENCE: -"Sentence evidence 1" -"Sentence evidence 2" ANSWER: YES EXAMPLE RESPONSE 2: REASONING: To answer the question, we need to find information about [. . .]. The title and abstract say something about [. . .] but do not mention anything about [. . .]. As there is no definitive evidence, the answer should be UNKNOWN. EVIDENCE: [] ANSWER: UNKNOWN EXAMPLE RESPONSE 3: REASONING: To answer the question, we need to find information about [. . .]. The title and abstract say something about [. . .]. This statement rules out that [. . .]. As there is evidence to the contrary, the answer should be NO. EVIDENCE: -"Sentence evidence 1" ANSWER: NO TITLE: {title} ABSTRACT: {abstract} QUESTION: {question}</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: The prompt template that was used during the experiments. The first part delineates the task. The second paragraph contains instructions on formatting responses, with detailed instructions per field. Next, three example responses are given. Finally, the title, abstract, and one of the criteria are supplied.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 2 :</head><label>2</label><figDesc>Figure 2: An excerpt of the screening protocol that was used within this case study.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_5"><head>Figure 3 :</head><label>3</label><figDesc>Figure 3: Label statistics displayed in an alluvial diagram, which shows some of the relations between the labels, for example, that criterion 𝑐 is True for only a tiny subset of the documents for which 𝑏 is False. Also, it becomes clear that criterion 𝑑 excludes the most documents of all the criteria.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_6"><head></head><label></label><figDesc>¬𝑎, ?𝑏, 𝑐, ¬𝑑 vs. Ground Truth ¬𝑎, ?𝑏, 𝑐, ¬𝑑 Title: Gender-based violence and its association with mental health among Somali women in a Kenyan refugee camp: a latent class analysis Abstract: BACKGROUND: c In conflict-affected settings, women and girls are c vulnerable to gender-based violence (GBV). GBV is c associated with poor long-term mental health such c as anxiety, depression and post-traumatic stress disorder (PTSD). Understanding the interaction between current violence and past conflict-related violence with ongoing mental health is essential for improving mental health service provision in refugee camps. METHODS: d Using data collected from 209 women attending GBV d case management centres in the Dadaab refugee camps, d Kenya, we grouped women by recent experience of GBV d using latent class analysis and modelled the relationship d between the groups and symptomatic scores for anxiety, d depression and PTSD using linear regression. RESULTS: Women with past-year experience of intimate partner violence alone may have a higher risk of depression than women with past-year experience of non-partner violence alone (Coef. 1.68, 95% CI 0.25 to 3.11). c,d Conflict-related violence was an important risk factor c,d for poor mental health among women who accessed GBV c,d services, despite time since occurrence (average time c,d in camp was 11.5 years) and even for those with a c,d past-year experience of GBV (Anxiety: 3.48, 1.85-5.10; c,d</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_9"><head>Figure 5 :</head><label>5</label><figDesc>Figure 5: Recall curves that show retrieval statistics for both methods on the dataset of the case study. The dashed blue diagonal line shows how many documents would have been found at random. The horizontal lines show the 95 and 100 % recall targets. The vertical dashed lines show when several recall targets have been achieved.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_10"><head>Figure 6 :</head><label>6</label><figDesc>Figure6: Here, we display, per recall target, the Work Saved over Sampling scores of the runs. We conducted multiple runs (𝑛 = 30) per method. It is clearly visible that our combined method (LLM+CAL) outperforms the AutoTAR baseline for every recall target.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>1 :</head><label>1</label><figDesc>procedure CAL(𝒟, 𝒰, ℒ + , ℒ − , 𝐶, 𝑘)</figDesc><table><row><cell>2:</cell><cell>𝑆 ← false</cell><cell>◁ Variable indicating whether CAL can be stopped</cell></row><row><cell>3:</cell><cell>while |𝒰| &gt; 0 and not 𝑆 do</cell><cell></cell></row><row><cell>4:</cell><cell>𝐶.Fit(ℒ + , ℒ − )</cell><cell></cell></row><row><cell>5:</cell><cell>ℬ ← Select(𝒰, 𝐶, 𝑘)</cell><cell></cell></row><row><cell>6:</cell><cell>for 𝑑 ∈ ℬ do</cell><cell></cell></row><row><cell>7:</cell><cell>𝑦 ← Review(𝑑)</cell><cell>◁ Performed by the human reviewer</cell></row><row><cell>8:</cell><cell>if 𝑦 = Relevant then</cell><cell></cell></row><row><cell>9:</cell><cell>ℒ + ← ℒ + ∪ {𝑑}</cell><cell></cell></row><row><cell>10:</cell><cell>else</cell><cell></cell></row><row><cell>11:</cell><cell>ℒ − ← ℒ − ∪ {𝑑}</cell><cell></cell></row><row><cell>12:</cell><cell>end if</cell><cell></cell></row><row><cell>13:</cell><cell>𝒰 ← 𝒰 ∖ {𝑑}</cell><cell></cell></row><row><cell>14:</cell><cell>end for</cell><cell></cell></row><row><cell>15:</cell><cell cols="2">𝑆 ← StoppingCriterion(𝒟, 𝒰, ℒ + , ℒ − , 𝐶, 𝑘)</cell></row><row><cell>16:</cell><cell>end while</cell><cell></cell></row><row><cell>17:</cell><cell>return ℒ + , ℒ −</cell><cell></cell></row><row><cell cols="2">18: end procedure</cell><cell></cell></row><row><cell cols="2">19: procedure Select(𝒰, 𝐶, 𝑘)</cell><cell></cell></row><row><cell>20:</cell><cell>P ← 𝐶.Predict(𝒰)</cell><cell>◁ Returns the relevance score for all 𝑑 in 𝒰</cell></row><row><cell>21:</cell><cell>R ←Rank(𝒰, P)</cell><cell></cell></row><row><cell>22:</cell><cell>ℬ ← Head(R, 𝒰, 𝑘)</cell><cell>◁ Gets the top-𝑘 documents</cell></row><row><cell>23:</cell><cell>return ℬ</cell><cell></cell></row><row><cell cols="2">24: end procedure</cell><cell></cell></row></table></figure>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_0">Michiel P. Bron et al. CEUR Workshop Proceedings 77-95</note>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgements</head><p>We thank the anonymous reviewers for their insightful comments, which helped improve this article's quality. This work was sponsored by a grant from the Dutch Research Council (Domain Social Sciences and Humanities [SSH]), with file no. 406.22.GO.048. Moreover, this work was sponsored by a grant from the Human-Centered Artificial Intelligence focus area at Utrecht University.</p></div>
			</div>

			<div type="annex">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 2</head><p>Confusion matrices for the LLM classifier (rows: ground truth, columns: predictions) These results were obtained by classifying each document in the dataset using ChatGPT-3.5. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Criterion a</head><note type="other">True</note></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.2.">Active Learning methods</head><p>After obtaining the LLM's results, we conducted several simulation runs of the AutoTAR baseline and our LLM+CAL method. Because both methods contain components in which random sampling takes place, we performed 30 runs per method to account for this. We stopped each simulation run after supplying the oracle 1295 papers, which is the number of documents the LLM predicted to be included (|ℒ {+,?} LLM | = 1295).</p><p>Stopping at this moment allows a comparison of the LLM's recall to those of these methods given the same human reviewing effort. The recall curves of the methods are displayed in Figure <ref type="figure">5</ref>. The mean recall (after stopping the simulation) of the AutoTAR method is 96.52 %, which is above the recall obtained with the LLM given the same human review effort. With the combined method, a similar recall is obtained (96.68 %), finding 177 out of 183 documents, reducing the number of missed studies from 16 to 6.</p><p>The mean recall after stopping the simulation is roughly the same for both AL methods. However, when considering other recall targets, it is evident that our combined method outperforms the baseline. For example, at 95 % recall, our method has a mean WSS@95 of 80.53 % versus 71.41 % of AutoTAR. This indicates that using the LLM predictions gives an additional advantage in retrieving relevant documents faster. In Figure <ref type="figure">6</ref>, we give an overview of the performance for several other targets, of which all indicate that the LLM+CAL method outperforms the AutoTAR baseline. </p></div>			</div>
			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Semi-automated screening of biomedical citations for systematic reviews</title>
		<author>
			<persName><forename type="first">B</forename><forename type="middle">C</forename><surname>Wallace</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">A</forename><surname>Trikalinos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Lau</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Brodley</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">H</forename><surname>Schmid</surname></persName>
		</author>
		<idno type="DOI">10.1186/1471-2105-11-55</idno>
	</analytic>
	<monogr>
		<title level="j">BMC Bioinformatics</title>
		<imprint>
			<biblScope unit="volume">11</biblScope>
		<biblScope unit="issue">1</biblScope>
			<biblScope unit="page" from="1" to="11" />
			<date type="published" when="2010">2010</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<monogr>
		<title level="m">Perspectives on Predictive Coding: And Other Advanced Search Methods for the Legal Practitioner</title>
				<editor>
			<persName><forename type="first">J</forename><forename type="middle">R</forename><surname>Baron</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">R</forename><forename type="middle">C</forename><surname>Losey</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">M</forename><forename type="middle">D</forename><surname>Berman</surname></persName>
		</editor>
		<meeting><address><addrLine>Chicago, Illinois</addrLine></address></meeting>
		<imprint>
			<publisher>American Bar Association</publisher>
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
	<note>American Bar Association</note>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Goldilocks: Just-Right Tuning of BERT for Technology-Assisted Review</title>
		<author>
			<persName><forename type="first">E</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Macavaney</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">D</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Frieder</surname></persName>
		</author>
		<idno type="DOI">10.1007/978-3-030-99736-6_34</idno>
	</analytic>
	<monogr>
		<title level="m">Advances in Information Retrieval</title>
				<editor>
			<persName><forename type="first">M</forename><surname>Hagen</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">S</forename><surname>Verberne</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">C</forename><surname>Macdonald</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">C</forename><surname>Seifert</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">K</forename><surname>Balog</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">K</forename><surname>Nørvåg</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">V</forename><surname>Setty</surname></persName>
		</editor>
		<meeting><address><addrLine>Cham</addrLine></address></meeting>
		<imprint>
			<publisher>Springer International Publishing</publisher>
			<date type="published" when="2022">2022</date>
			<biblScope unit="volume">13185</biblScope>
			<biblScope unit="page" from="502" to="517" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Information Retrieval for E-Discovery</title>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">W</forename><surname>Oard</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Webber</surname></persName>
		</author>
		<idno type="DOI">10.1561/1500000025</idno>
	</analytic>
	<monogr>
		<title level="j">Foundations and Trends® in Information Retrieval</title>
		<imprint>
			<biblScope unit="volume">7</biblScope>
			<biblScope unit="page" from="99" to="237" />
			<date type="published" when="2013">2013</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Reducing workload in systematic review preparation using automated citation classification</title>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">M</forename><surname>Cohen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><forename type="middle">R</forename><surname>Hersh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Peterson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">Y</forename><surname>Yen</surname></persName>
		</author>
		<idno type="DOI">10.1197/jamia.M1929</idno>
	</analytic>
	<monogr>
		<title level="j">Journal of the American Medical Informatics Association</title>
		<imprint>
			<biblScope unit="volume">13</biblScope>
			<biblScope unit="page" from="206" to="219" />
			<date type="published" when="2006">2006</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">Engineering Quality and Reliability in Technology-Assisted Review</title>
		<author>
			<persName><forename type="first">G</forename><forename type="middle">V</forename><surname>Cormack</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">R</forename><surname>Grossman</surname></persName>
		</author>
		<idno type="DOI">10.1145/2911451.2911510</idno>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 39th International ACM SIGIR Conference on Research and Development in Information Retrieval -SIGIR &apos;16</title>
				<meeting>the 39th International ACM SIGIR Conference on Research and Development in Information Retrieval -SIGIR &apos;16<address><addrLine>New York, New York, USA</addrLine></address></meeting>
		<imprint>
			<publisher>ACM Press</publisher>
			<date type="published" when="2016">2016</date>
			<biblScope unit="page" from="75" to="84" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">FAST2: An intelligent assistant for finding relevant papers</title>
		<author>
			<persName><forename type="first">Z</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Menzies</surname></persName>
		</author>
		<idno type="DOI">10.1016/j.eswa.2018.11.021</idno>
	</analytic>
	<monogr>
		<title level="j">Expert Systems with Applications</title>
		<imprint>
			<biblScope unit="volume">120</biblScope>
			<biblScope unit="page" from="57" to="71" />
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">Research Screener: A machine learning tool to semi-automate abstract screening for systematic reviews</title>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">E K</forename><surname>Chai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">L J</forename><surname>Lines</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">F</forename><surname>Gucciardi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Ng</surname></persName>
		</author>
		<idno type="DOI">10.1186/s13643-021-01635-3</idno>
	</analytic>
	<monogr>
		<title level="j">Systematic Reviews</title>
		<imprint>
			<biblScope unit="volume">10</biblScope>
			<biblScope unit="page">93</biblScope>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">An open source machine learning framework for efficient and transparent systematic reviews</title>
		<author>
			<persName><forename type="first">R</forename><surname>Van De Schoot</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>De Bruin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Schram</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Zahedi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>De Boer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Weijdema</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Kramer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Huijts</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Hoogerwerf</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Ferdinands</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Harkema</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Willemsen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Ma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Fang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Hindriks</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Tummers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">L</forename><surname>Oberski</surname></persName>
		</author>
		<idno type="DOI">10.1038/s42256-020-00287-7</idno>
	</analytic>
	<monogr>
		<title level="j">Nature Machine Intelligence</title>
		<imprint>
			<biblScope unit="volume">3</biblScope>
			<biblScope unit="page" from="125" to="133" />
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">When to Stop Reviewing in Technology-Assisted Reviews: Sampling from an Adaptive Distribution to Estimate Residual Relevant Documents</title>
		<author>
			<persName><forename type="first">D</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Kanoulas</surname></persName>
		</author>
		<idno type="DOI">10.1145/3411755</idno>
	</analytic>
	<monogr>
		<title level="j">ACM Transactions on Information Systems</title>
		<imprint>
			<biblScope unit="volume">38</biblScope>
			<biblScope unit="page" from="1" to="36" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">Title and abstract screening for literature reviews using large language models: An exploratory study in the biomedical domain</title>
		<author>
			<persName><forename type="first">F</forename><surname>Dennstädt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Zink</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">M</forename><surname>Putora</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Hastings</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Cihoric</surname></persName>
		</author>
		<idno type="DOI">10.1186/s13643-024-02575-4</idno>
	</analytic>
	<monogr>
		<title level="j">Systematic Reviews</title>
		<imprint>
			<biblScope unit="volume">13</biblScope>
			<biblScope unit="page">158</biblScope>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">A Sequential Algorithm for Training Text Classifiers</title>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">D</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><forename type="middle">A</forename><surname>Gale</surname></persName>
		</author>
		<idno type="DOI">10.1007/978-1-4471-2099-5_1</idno>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 17th Annual International ACM-SIGIR Conference on Research and Development in Information Retrieval</title>
				<editor>
			<persName><forename type="first">W</forename><forename type="middle">B</forename><surname>Croft</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">C</forename><forename type="middle">J</forename><surname>Van Rijsbergen</surname></persName>
		</editor>
		<meeting>the 17th Annual International ACM-SIGIR Conference on Research and Development in Information Retrieval<address><addrLine>Dublin, Ireland</addrLine></address></meeting>
		<imprint>
			<publisher>ACM/Springer</publisher>
			<date type="published" when="1994-07-06">3-6 July 1994</date>
			<biblScope unit="page" from="3" to="12" />
		</imprint>
	</monogr>
	<note>Special Issue of the SIGIR Forum</note>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">Reproducibility and Data Storage for Active Learning-Aided Systematic Reviews</title>
		<author>
			<persName><forename type="first">P</forename><surname>Lombaers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>De Bruin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Van De Schoot</surname></persName>
		</author>
		<idno type="DOI">10.3390/app14093842</idno>
	</analytic>
	<monogr>
		<title level="j">Applied Sciences</title>
		<imprint>
			<biblScope unit="volume">14</biblScope>
			<biblScope unit="page">3842</biblScope>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<monogr>
		<title level="m" type="main">Autonomy and Reliability of Continuous Active Learning for Technology-Assisted Review</title>
		<author>
			<persName><forename type="first">G</forename><forename type="middle">V</forename><surname>Cormack</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">R</forename><surname>Grossman</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1504.06868</idno>
		<imprint>
			<date type="published" when="2015">2015</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">Improving retrieval performance by relevance feedback</title>
		<author>
			<persName><forename type="first">G</forename><surname>Salton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Buckley</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">J. Am. Soc. Inf. Sci</title>
		<imprint>
			<biblScope unit="volume">41</biblScope>
			<biblScope unit="page" from="288" to="297" />
			<date type="published" when="1990">1990</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">Automated Paper Screening for Clinical Reviews Using Large Language Models: Data Analysis Study</title>
		<author>
			<persName><forename type="first">E</forename><surname>Guo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Gupta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Deng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y.-J</forename><surname>Park</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Paget</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Naugler</surname></persName>
		</author>
		<idno type="DOI">10.2196/48996</idno>
	</analytic>
	<monogr>
		<title level="j">Journal of Medical Internet Research</title>
		<imprint>
			<biblScope unit="volume">26</biblScope>
			<biblScope unit="page">e48996</biblScope>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<monogr>
		<title level="m" type="main">Automated title and abstract screening for scoping reviews using the GPT-4 Large Language Model</title>
		<author>
			<persName><forename type="first">D</forename><surname>Wilkins</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2311.07918</idno>
		<idno type="arXiv">arXiv:2311.07918</idno>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<monogr>
		<title level="m" type="main">Zero-shot Generative Large Language Models for Systematic Review Screening Automation</title>
		<author>
			<persName><forename type="first">S</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Scells</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Zhuang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Potthast</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Koopman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Zuccon</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2401.06320</idno>
		<idno type="arXiv">arXiv:2401.06320</idno>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">Hallucinations in Large Multilingual Translation Models</title>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">M</forename><surname>Guerreiro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">M</forename><surname>Alves</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Waldendorf</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Haddow</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Birch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Colombo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">F T</forename><surname>Martins</surname></persName>
		</author>
		<idno type="DOI">10.1162/tacl_a_00615</idno>
	</analytic>
	<monogr>
		<title level="j">Transactions of the Association for Computational Linguistics</title>
		<imprint>
			<biblScope unit="volume">11</biblScope>
			<biblScope unit="page" from="1500" to="1517" />
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<monogr>
		<author>
			<persName><forename type="first">S</forename><surname>Feng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Shi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Ding</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Balachandran</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Tsvetkov</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2402.00367</idno>
		<idno type="arXiv">arXiv:2402.00367</idno>
		<title level="m">Don&apos;t Hallucinate, Abstain: Identifying LLM Knowledge Gaps via Multi-LLM Collaboration</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b20">
	<analytic>
		<title level="a" type="main">Enhancing Active Learning with Weak Supervision and Transfer Learning by Leveraging Information and Knowledge Sources</title>
		<author>
			<persName><forename type="first">L</forename><surname>Rauch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Huseljic</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Sick</surname></persName>
		</author>
		<ptr target="org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Workshop on Interactive Adaptive Learning Co-Located with European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML PKDD 2022)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<editor>
			<persName><forename type="first">D</forename><surname>Kottke</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">G</forename><surname>Krempl</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">A</forename><surname>Holzinger</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">B</forename><surname>Hammer</surname></persName>
		</editor>
		<meeting>the Workshop on Interactive Adaptive Learning Co-Located with European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML PKDD 2022)<address><addrLine>Grenoble, France</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2022-09-23">September 23, 2022</date>
			<biblScope unit="volume">3259</biblScope>
			<biblScope unit="page" from="27" to="42" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b21">
	<analytic>
		<title level="a" type="main">Evaluation of machine-learning protocols for technology-assisted review in electronic discovery</title>
		<author>
			<persName><forename type="first">G</forename><forename type="middle">V</forename><surname>Cormack</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">R</forename><surname>Grossman</surname></persName>
		</author>
		<idno type="DOI">10.1145/2600428.2609601</idno>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 37th International ACM SIGIR Conference on Research &amp; Development in Information Retrieval, SIGIR &apos;14</title>
				<meeting>the 37th International ACM SIGIR Conference on Research &amp; Development in Information Retrieval, SIGIR &apos;14<address><addrLine>New York, NY, USA</addrLine></address></meeting>
		<imprint>
			<publisher>Association for Computing Machinery</publisher>
			<date type="published" when="2014">2014</date>
			<biblScope unit="page" from="153" to="162" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<analytic>
		<title level="a" type="main">Statistical stopping criteria for automated screening in systematic reviews</title>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">W</forename><surname>Callaghan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Müller-Hansen</surname></persName>
		</author>
		<idno type="DOI">10.1186/s13643-020-01521-4</idno>
	</analytic>
	<monogr>
		<title level="j">Systematic Reviews</title>
		<imprint>
			<biblScope unit="volume">9</biblScope>
			<biblScope unit="page" from="1" to="14" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b23">
	<monogr>
		<title level="m" type="main">Using Chao&apos;s Estimator as a Stopping Criterion for Technology-Assisted Review</title>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">P</forename><surname>Bron</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">G M</forename><surname>Van Der Heijden</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">J</forename><surname>Feelders</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">P J M</forename><surname>Siebes</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2404.01176</idno>
		<idno type="arXiv">arXiv:2404.01176</idno>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b24">
	<analytic>
		<title level="a" type="main">Stopping Methods for Technology-assisted Reviews Based on Point Processes</title>
		<author>
			<persName><forename type="first">M</forename><surname>Stevenson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Bin-Hezam</surname></persName>
		</author>
		<idno type="DOI">10.1145/3631990</idno>
	</analytic>
	<monogr>
		<title level="j">ACM Transactions on Information Systems</title>
		<imprint>
			<biblScope unit="volume">42</biblScope>
			<biblScope unit="page">37</biblScope>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b25">
	<analytic>
		<title level="a" type="main">Machine learning to optimize literature screening in medical guideline development</title>
		<author>
			<persName><forename type="first">W</forename><surname>Harmsen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>De Groot</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Harkema</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Van Dusseldorp</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>De Bruin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Van Den Brand</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Van De Schoot</surname></persName>
		</author>
		<idno type="DOI">10.1186/s13643-024-02590-5</idno>
	</analytic>
	<monogr>
		<title level="j">Systematic Reviews</title>
		<imprint>
			<biblScope unit="volume">13</biblScope>
			<biblScope unit="page">177</biblScope>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b26">
	<analytic>
		<title level="a" type="main">The SAFE procedure: A practical stopping heuristic for active learning-based screening in systematic reviews and meta-analyses</title>
		<author>
			<persName><forename type="first">J</forename><surname>Boetje</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Van De Schoot</surname></persName>
		</author>
		<idno type="DOI">10.1186/s13643-024-02502-7</idno>
	</analytic>
	<monogr>
		<title level="j">Systematic Reviews</title>
		<imprint>
			<biblScope unit="volume">13</biblScope>
			<biblScope unit="page">81</biblScope>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b27">
	<analytic>
		<title level="a" type="main">Heuristic stopping rules for technology-assisted review</title>
		<author>
			<persName><forename type="first">E</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">D</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Frieder</surname></persName>
		</author>
		<idno type="DOI">10.1145/3469096.3469873</idno>
	</analytic>
	<monogr>
		<title level="m">DocEng 2021 -Proceedings of the 2021 ACM Symposium on Document Engineering</title>
				<meeting><address><addrLine>Limerick, Ireland</addrLine></address></meeting>
		<imprint>
			<publisher>ACM</publisher>
			<date type="published" when="2021">2021</date>
			<biblScope unit="volume">31</biblScope>
			<biblScope unit="page">10</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b28">
	<analytic>
		<title level="a" type="main">Active learning-based systematic reviewing using switching classification models: The case of the onset, maintenance, and relapse of depressive disorders</title>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">J</forename><surname>Teijema</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Hofstee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Brouwer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>De Bruin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Ferdinands</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>De Boer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Vizan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Van Den Brand</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Bockting</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Van De Schoot</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Bagheri</surname></persName>
		</author>
		<idno type="DOI">10.3389/frma.2023.1178181</idno>
	</analytic>
	<monogr>
		<title level="j">Frontiers in Research Metrics and Analytics</title>
		<imprint>
			<biblScope unit="volume">8</biblScope>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b29">
	<monogr>
		<author>
			<persName><forename type="first">J</forename><surname>Openai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Achiam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Adler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Agarwal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Ahmad</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><forename type="middle">L</forename><surname>Akkaya</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Aleman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Almeida</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Altenschmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Altman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Anadkat</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Avila</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Babuschkin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Balaji</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Balcom</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Baltescu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Bao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Bavarian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Belgum</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Bello</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Berdine</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Bernadett-Shapiro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Berner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Bogdonoff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Boiko</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A.-L</forename><surname>Boyd</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Brakman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Brockman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Brooks</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Brundage</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Button</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Cai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Campbell</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">P</forename><surname>Bron</surname></persName>
		</author>
		<title level="m">CEUR Workshop Proceedings</title>
		<imprint>
			<biblScope unit="page" from="77" to="95" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b30">
	<monogr>
		<title/>
		<author>
			<persName><forename type="first">A</forename><surname>Cann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Carey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Carlson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Carmichael</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Chan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Chang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Chantzis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Chess</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Cho</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Chu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><forename type="middle">W</forename><surname>Chung</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Cummings</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Currier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Dai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Decareaux</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Degry</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Deutsch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Deville</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Dhar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Dohan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Dowling</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Dunning</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ecoffet</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Eleti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Eloundou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Farhi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Fedus</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Felix</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">P</forename><surname>Fishman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Forte</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Fulford</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Gao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Georges</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Gibson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Goel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Gogineni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Goh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Gontijo-Lopes</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Gordon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Grafstein</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Gray</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Greene</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Gross</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">S</forename><surname>Gu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Guo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Hallacy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Han</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Harris</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>He</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Heaton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Heidecke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Hesse</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Hickey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Hickey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Hoeschele</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Houghton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Hsu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Huizinga</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Jain</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Jain</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Jang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Jiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Jiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Jin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Jin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Jomoto</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Jonn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Jun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Kaftan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Ł</forename><surname>Kaiser</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Kamali</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Kanitscheider</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">S</forename><surname>Keskar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Khan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Kilpatrick</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">W</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">H</forename><surname>Kirchner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kiros</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Knight</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Kokotajlo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Ł</forename><surname>Kondraciuk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Kondrich</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Konstantinidis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Kosic</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Krueger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kuo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Lampe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Lan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Leike</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Leung</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Levy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Lim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Litwin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lopez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Lowe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Lue</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Makanju</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Malfacini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Manning</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Markov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Markovski</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Martin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Mayer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mayne</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>McGrew</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">M</forename><surname>McKinney</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>McLeavey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>McMillan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>McNeil</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Medina</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mehta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Menick</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Metz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mishchenko</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Mishkin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Monaco</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Morikawa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Mossing</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Mu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Murati</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Murk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Mély</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Nair</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Nakano</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Nayak</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Neelakantan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Ngo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Noh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Ouyang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>O&apos;Keefe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Pachocki</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Paino</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Palermo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Pantuliano</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Parascandolo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Parish</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Parparita</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Passos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Pavlov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Peng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Perelman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><forename type="middle">D A B</forename><surname>Peres</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Petrov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><forename type="middle">P D O</forename><surname>Pinto</surname></persName>
		</author>
		<author>
			<persName><surname>Michael</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Pokorny</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><forename type="middle">H</forename><surname>Pokrass</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Pong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Powell</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Power</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Power</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Proehl</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Puri</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Radford</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Rae</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Ramesh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Raymond</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Real</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Rimbach</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Ross</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Rotsted</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Roussez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Ryder</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Saltarelli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Sanders</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Santurkar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Sastry</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Schmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Schnurr</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Schulman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Selsam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Sheppard</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Sherbakov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Shieh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Shoker</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Shyam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Sidor</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Sigler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Simens</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Sitkin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Slama</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Sohl</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Sokolowsky</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Song</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><forename type="middle">P</forename><surname>Staudacher</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Such</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Summers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Sutskever</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Tang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">B</forename><surname>Tezak</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Thompson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Tillet</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Tootoonchian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Tseng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Tuggle</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Turley</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">F C</forename><surname>Tworek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Uribe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Vallone</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Vijayvergiya</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Voss</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">J</forename><surname>Wainwright</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Ward</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">J</forename><surname>Wei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Weinmann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Welihinda</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Welinder</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Weng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Weng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Wiethoff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Willner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Winter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Wolrich</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Wong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Workman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Xiao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Yoo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Yuan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Zaremba</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Zellers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Zhao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Zheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Zhuang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Zhuk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Zoph</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2303.08774v6" />
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
	<note type="report_type">GPT-4 Technical Report</note>
</biblStruct>

<biblStruct xml:id="b31">
	<monogr>
		<title level="m" type="main">Assessing the Ability of ChatGPT to Screen Articles for Systematic Reviews</title>
		<author>
			<persName><forename type="first">E</forename><surname>Syriani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>David</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Kumar</surname></persName>
		</author>
		<idno type="DOI">10.48550/ARXIV.2307.06464</idno>
		<idno type="arXiv">arXiv:2307.06464</idno>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b32">
	<monogr>
		<author>
			<persName><forename type="first">H</forename><surname>Touvron</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Martin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Stone</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Albert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Almahairi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Babaei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Bashlykov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Batra</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Bhargava</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Bhosale</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Bikel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Blecher</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Canton-Ferrer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Cucurull</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Esiobu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Fernandes</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Fuller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Gao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Goswami</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Goyal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Hartshorn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Hosseini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Hou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Inan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Kardas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kerkez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Khabsa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Kloumann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Korenev</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">S</forename><surname>Koura</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M.-A</forename><surname>Lachaux</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lavril</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Liskovich</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Mao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Martinet</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Mihaylov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Mishra</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Molybog</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Nie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Poulton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Reizenstein</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Rungta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Saladi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Schelten</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Silva</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><forename type="middle">M</forename><surname>Smith</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Subramanian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><forename type="middle">E</forename><surname>Tan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Tang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Taylor</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Williams</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">X</forename><surname>Kuan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Yan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Zarov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Fan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Kambadur</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Narang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Rodriguez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Stojnic</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Edunov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Scialom</surname></persName>
		</author>
		<idno type="DOI">10.48550/ARXIV.2307.09288</idno>
		<idno type="arXiv">arXiv:2307.09288</idno>
		<title level="m">Llama 2: Open Foundation and Fine-Tuned Chat Models</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b33">
	<analytic>
		<title level="a" type="main">CLEF 2017 technologically assisted reviews in empirical medicine overview</title>
		<author>
			<persName><forename type="first">E</forename><surname>Kanoulas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Azzopardi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Spijker</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">CEUR Workshop Proceedings</title>
		<imprint>
			<biblScope unit="volume">1866</biblScope>
			<biblScope unit="page" from="1" to="29" />
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b34">
	<analytic>
		<title level="a" type="main">CLEF 2018 technologically assisted reviews in empirical medicine overview</title>
		<author>
			<persName><forename type="first">E</forename><surname>Kanoulas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Azzopardi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Spijker</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">19th Working Notes of CLEF Conference and Labs of the Evaluation Forum, CLEF 2018</title>
				<imprint>
			<date type="published" when="2018">2018</date>
			<biblScope unit="page">2125</biblScope>
		</imprint>
	</monogr>
	<note>CEUR Workshop Proceedings</note>
</biblStruct>

<biblStruct xml:id="b35">
	<analytic>
		<title level="a" type="main">CLEF 2019 technology assisted reviews in empirical medicine overview</title>
		<author>
			<persName><forename type="first">E</forename><surname>Kanoulas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Azzopardi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Spijker</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">CEUR Workshop Proceedings</title>
		<imprint>
			<biblScope unit="volume">2380</biblScope>
			<biblScope unit="page" from="9" to="12" />
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b36">
	<monogr>
		<title level="m" type="main">Human Still Wins over LLM: An Empirical Study of Active Learning on Domain-Specific Annotation Tasks</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Yao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">J.-J</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Wang</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2311.09825</idno>
		<idno type="arXiv">arXiv:2311.09825</idno>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b37">
	<analytic>
		<title level="a" type="main">LLMs in the Loop: Leveraging Large Language Model Annotations for Active Learning in Low-Resource Languages</title>
		<author>
			<persName><forename type="first">N</forename><surname>Kholodna</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Julka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Khodadadi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">N</forename><surname>Gumus</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Granitzer</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">CEUR Workshop Proceedings</title>
				<editor>
			<persName><forename type="first">Michiel</forename><forename type="middle">P</forename><surname>Bron</surname></persName>
		</editor>
		<imprint>
			<biblScope unit="page" from="77" to="95" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b38">
	<monogr>
		<idno type="DOI">10.48550/arXiv.2404.02261</idno>
		<idno type="arXiv">arXiv:2404.02261</idno>
		<title level="m">Large Language Model Annotations for Active Learning in Low-Resource Languages</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b39">
	<monogr>
		<title level="m" type="main">SQBC: Active Learning using LLM-Generated Synthetic Data for Stance Detection in Online Political Discussions</title>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">S</forename><surname>Wagner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Behrendt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Ziegele</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Harmeling</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2404.08078</idno>
		<idno type="arXiv">arXiv:2404.08078</idno>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b40">
	<analytic>
		<title level="a" type="main">Chain-of-Thought Prompting Elicits Reasoning in Large Language Models</title>
		<author>
			<persName><forename type="first">J</forename><surname>Wei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Schuurmans</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Bosma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Ichter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Xia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Chi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><forename type="middle">V</forename><surname>Le</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Zhou</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Advances in Neural Information Processing Systems</title>
		<imprint>
			<biblScope unit="volume">35</biblScope>
			<biblScope unit="page" from="24824" to="24837" />
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b41">
	<monogr>
		<title level="m">LangChain</title>
		<author>
			<persName><forename type="first">H</forename><surname>Chase</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b42">
	<monogr>
		<title level="m" type="main">HuggingFace&apos;s Transformers: State-of-the-art Natural Language Processing</title>
		<author>
			<persName><forename type="first">T</forename><surname>Wolf</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Debut</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Sanh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Chaumond</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Delangue</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Moi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Cistac</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Rault</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Louf</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Funtowicz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Davison</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Shleifer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Von Platen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Ma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Jernite</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Plu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">L</forename><surname>Scao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Gugger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Drame</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Lhoest</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">M</forename><surname>Rush</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.1910.03771</idno>
		<idno type="arXiv">arXiv:1910.03771</idno>
		<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b43">
	<monogr>
		<title level="m" type="main">Trajectories of PTSD following traumatic events: A systematic and multi-database review, PROSPERO</title>
		<author>
			<persName><forename type="first">R</forename><surname>Van De Schoot</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Coimbra</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Evenhuis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Lombaers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Van Zuiden</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Grandfield</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>De Bruin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Teijema</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>De Bruin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Neeleman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Jalsovec</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b44">
	<monogr>
		<author>
			<persName><forename type="first">J</forename><surname>De Bruin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Ma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Ferdinands</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Teijema</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Van De Schoot</surname></persName>
		</author>
		<idno type="DOI">10.34894/HE6NAQ</idno>
		<title level="m">SYNERGY - Open machine learning dataset on study selection in systematic reviews</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b45">
	<analytic>
		<title level="a" type="main">Bayesian PTSD-Trajectory Analysis with Informed Priors Based on a Systematic Literature Search and Expert Elicitation</title>
		<author>
			<persName><forename type="first">R</forename><surname>Van De Schoot</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Sijbrandij</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Depaoli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">D</forename><surname>Winter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Olff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">E</forename><surname>Van Loey</surname></persName>
		</author>
		<idno type="DOI">10.1080/00273171.2017.1412293</idno>
	</analytic>
	<monogr>
		<title level="j">Multivariate Behavioral Research</title>
		<imprint>
			<biblScope unit="volume">53</biblScope>
			<biblScope unit="page" from="267" to="291" />
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b46">
	<analytic>
		<title level="a" type="main">Gender-based violence and its association with mental health among Somali women in a Kenyan refugee camp: A latent class analysis</title>
		<author>
			<persName><forename type="first">M</forename><surname>Hossain</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Pearson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mcalpine</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">J</forename><surname>Bacchus</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Spangaro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Muthuri</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Muuo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Franchi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Hess</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Bangha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Izugbara</surname></persName>
		</author>
		<idno type="DOI">10.1136/jech-2020-214086</idno>
	</analytic>
	<monogr>
		<title level="j">Journal of Epidemiology and Community Health</title>
		<imprint>
			<biblScope unit="volume">75</biblScope>
			<biblScope unit="page" from="327" to="334" />
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b47">
	<analytic>
		<title level="a" type="main">Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks</title>
		<author>
			<persName><forename type="first">N</forename><surname>Reimers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Gurevych</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/D19-1410</idno>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">K</forename><surname>Inui</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><surname>Jiang</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">V</forename><surname>Ng</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">X</forename><surname>Wan</surname></persName>
		</editor>
		<meeting>the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), Association for Computational Linguistics<address><addrLine>Hong Kong, China</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2019">2019</date>
			<biblScope unit="page" from="3982" to="3992" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b48">
	<analytic>
		<title level="a" type="main">Active neural learners for text with dual supervision</title>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">Shama</forename><surname>Sastry</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><forename type="middle">E</forename><surname>Milios</surname></persName>
		</author>
		<idno type="DOI">10.1007/s00521-019-04681-0</idno>
	</analytic>
	<monogr>
		<title level="j">Neural Computing and Applications</title>
		<imprint>
			<biblScope unit="volume">32</biblScope>
			<biblScope unit="page" from="13343" to="13362" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
