<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Evaluating Retrieval-Augmented Generation for Question Answering with Large Language Models</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Ermelinda</forename><surname>Oro</surname></persName>
							<email>ermelinda.oro@icar.cnr.it</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">National Research Council</orgName>
								<orgName type="department" key="dep2">Institute for High Performance Computing and Networking</orgName>
								<address>
									<addrLine>via P. Bucci 8/9C, (CS)</addrLine>
									<postCode>87036</postCode>
									<settlement>Rende</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="department" key="dep1">Altilia srl</orgName>
								<orgName type="department" key="dep2">TechNest Start-up Incubator</orgName>
								<orgName type="institution">University of Calabria</orgName>
								<address>
									<addrLine>Piazza Vermicelli, Rende (CS)</addrLine>
									<postCode>87036</postCode>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Francesco</forename><forename type="middle">Maria</forename><surname>Granata</surname></persName>
							<email>francesco.granata@altiliagroup.com</email>
							<affiliation key="aff1">
								<orgName type="department" key="dep1">Altilia srl</orgName>
								<orgName type="department" key="dep2">TechNest Start-up Incubator</orgName>
								<orgName type="institution">University of Calabria</orgName>
								<address>
									<addrLine>Piazza Vermicelli, Rende (CS)</addrLine>
									<postCode>87036</postCode>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Antonio</forename><surname>Lanza</surname></persName>
							<email>antonio.lanza@altiliagroup.com</email>
							<affiliation key="aff1">
								<orgName type="department" key="dep1">Altilia srl</orgName>
								<orgName type="department" key="dep2">TechNest Start-up Incubator</orgName>
								<orgName type="institution">University of Calabria</orgName>
								<address>
									<addrLine>Piazza Vermicelli, Rende (CS)</addrLine>
									<postCode>87036</postCode>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Amir</forename><surname>Bachir</surname></persName>
							<email>amir.bachir@altiliagroup.com</email>
							<affiliation key="aff1">
								<orgName type="department" key="dep1">Altilia srl</orgName>
								<orgName type="department" key="dep2">TechNest Start-up Incubator</orgName>
								<orgName type="institution">University of Calabria</orgName>
								<address>
									<addrLine>Piazza Vermicelli, Rende (CS)</addrLine>
									<postCode>87036</postCode>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Luca</forename><surname>De Grandis</surname></persName>
							<email>luca.degrandis@altiliagroup.com</email>
							<affiliation key="aff1">
								<orgName type="department" key="dep1">Altilia srl</orgName>
								<orgName type="department" key="dep2">TechNest Start-up Incubator</orgName>
								<orgName type="institution">University of Calabria</orgName>
								<address>
									<addrLine>Piazza Vermicelli, Rende (CS)</addrLine>
									<postCode>87036</postCode>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Massimo</forename><surname>Ruffolo</surname></persName>
							<email>massimo.ruffolo@altiliagroup.com</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">National Research Council</orgName>
								<orgName type="department" key="dep2">Institute for High Performance Computing and Networking</orgName>
								<address>
									<addrLine>via P. Bucci 8/9C, (CS)</addrLine>
									<postCode>87036</postCode>
									<settlement>Rende</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="department" key="dep1">Altilia srl</orgName>
								<orgName type="department" key="dep2">TechNest Start-up Incubator</orgName>
								<orgName type="institution">University of Calabria</orgName>
								<address>
									<addrLine>Piazza Vermicelli, Rende (CS)</addrLine>
									<postCode>87036</postCode>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">Evaluating Retrieval-Augmented Generation for Question Answering with Large Language Models</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">9E5D2F1F25DD3EA6A75A5BF24579E3B9</idno>
					<idno type="arXiv">arXiv:2402.01383</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T16:56+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Retrieval Augmented Generation (RAG), Question Answering (QA), Retrieval, Large Language Model (LLM), Evaluation</term>
					<term>0000-0002-5529-1007 (E. Oro)</term>
					<term>0000-0003-4425-753X (F. M. Granata)</term>
					<term>0000-0002-2875-4133 (L. D. Grandis)</term>
					<term>0000-0002-4094-4810 (M. Ruffolo)</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>We present a comprehensive framework for evaluating retrieval-augmented generation (RAG) systems designed for question-answering tasks using large language models (LLMs). The proposed framework integrates document ingestion, information retrieval, answer generation, and evaluation phases. Both ground truth-based and reference-free evaluation metrics are implemented to provide a multi-faceted assessment approach. Through experiments across diverse datasets like NarrativeQA and a proprietary financial dataset (FinAM-it), the reliability of existing metrics is investigated by comparing them against rigorous human evaluations. The results demonstrate that ground truth-based metrics such as BEM and RAGAS Answer Correctness exhibit a moderately strong correlation with human judgments. However, reference-free metrics still struggle to accurately capture nuances in answer quality without predefined correct responses. An in-depth analysis of Spearman correlation coefficients sheds light on the interrelationships and relative effectiveness of various evaluation approaches across multiple domains. While highlighting the current limitations of reference-free methodologies, the study underscores the need for more sophisticated techniques to better approximate human perception of answer relevance and correctness. Overall, this research contributes to ongoing efforts in developing reliable evaluation frameworks for RAG systems, paving the way for advancements in natural language processing and the realization of highly accurate and human-like AI systems.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>Retrieval-Augmented Generation (RAG) systems, which integrate information retrieval with natural language generation, have shown promise in enhancing language models' capabilities. However, evaluating their performance remains challenging, particularly when ground truth data is unavailable, impeding accurate assessments of system utility. To address this challenge, we present a comprehensive framework designed to facilitate the rigorous evaluation of RAG systems for question-answering tasks. Our framework integrates document ingestion, retrieval, generation, and evaluation phases, leveraging state-of-the-art technologies to optimize accuracy and relevance. We implement both ground truth-based and reference-free evaluation metrics, providing a multifaceted approach to assessing system outputs. Through an extensive series of experiments spanning diverse domains and datasets, we investigate the reliability and validity of existing evaluation methodologies. Specifically, we examine the correlation between various metrics and rigorous human evaluations, shedding light on their strengths, limitations, and potential for improvement. Our findings reveal that while ground truth-based metrics like BEM and RAGAS Answer Correctness exhibit moderate alignment with human judgments, reference-free metrics still struggle to accurately capture answer quality nuances without predefined correct responses.
By analyzing Spearman correlation coefficients, we elucidate the interrelationships and relative effectiveness of different evaluation approaches across multiple domains.</p><p>This research makes the following key contributions: (i) presenting a comprehensive framework for evaluating RAG systems with state-of-the-art components, (ii) implementing and comparing diverse ground truth-based and reference-free evaluation metrics, (iii) conducting rigorous experiments across multiple datasets to assess metric reliability against human judgments, and (iv) analyzing the strengths and limitations of existing metrics, highlighting the need for advanced reference-free evaluation techniques that better approximate human perception.</p><p>The rest of the paper is organized as follows: Section 2 discusses related work. Section 3 presents the method. Section 4 shows the experimental evaluation and Section 5 concludes the work. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Related Work</head><p>RAG systems have been implemented in various forms <ref type="bibr" target="#b0">[1,</ref><ref type="bibr" target="#b1">2,</ref><ref type="bibr" target="#b2">3,</ref><ref type="bibr" target="#b3">4,</ref><ref type="bibr" target="#b4">5]</ref>, incorporating advanced strategies like document splitting, chunking, retrieval, and diverse models for embedding and language generation, including proprietary and open-source models from platforms like HuggingFace<ref type="foot" target="#foot_0">1</ref> . We have also explored different variants of RAG systems, however, this paper's primary focus is not to introduce a novel RAG system or methodology but to comprehensively evaluate the effectiveness of Large Language Model (LLM)-derived metrics, emphasizing reference-free approaches.</p><p>Several prior works have proposed frameworks and novel metrics that leverage the capabilities of LLMs <ref type="bibr" target="#b5">[6,</ref><ref type="bibr" target="#b6">7,</ref><ref type="bibr" target="#b7">8,</ref><ref type="bibr" target="#b8">9,</ref><ref type="bibr" target="#b9">10,</ref><ref type="bibr" target="#b10">11]</ref>. Unlike these existing solutions, which aim to score different RAG systems or propose new evaluation methods, metrics, or datasets, our research is specifically targeted at evaluating the potential satisfaction of end-user customers who receive the evaluation scores generated by such systems.</p><p>By concentrating on the practical utility and interpretability of evaluation metrics from the perspective of end-users, our study diverges from the conventional approach of optimizing technical performance alone. Instead, we strive to bridge the gap between state-of-the-art evaluation techniques and the real-world expectations of customers who rely on these systems for decision-making and information retrieval.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Method</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Framework for RAG and evaluation</head><p>This paper introduces a framework for running and evaluating a RAG system for efficiently processing and responding to natural language queries. The system integrates state-of-the-art technologies to enhance answer accuracy and relevance. The process is segmented into four main phases: Ingestion: Input documents are processed into manageable chunks, leveraging techniques like document layout analysis for PDFs. The chunks are embedded into high-dimensional vectors capturing their semantic essence and ingested into a vector store for efficient similarity search. Retrieval: Upon receiving a query, its vector form undergoes similarity search in the vector store to identify the 𝑘 most relevant chunks. This narrows down the information to the most pertinent chunks for answer generation. Generation: A Large Language Model (LLM) synthesizes information from the retrieved chunks to construct a coherent and natural-sounding answer to the query. Evaluation: A two-sided approach employs both ground-truth dependent and independent metrics. Ground-truth dependent metrics assess correctness against predefined answers, while ground-truth independent metrics evaluate answer relevance without a predefined set. This dual approach enables a comprehensive assessment of performance, correctness, and overall text quality. The system can receive human evaluations of question-answer pairs to evaluate metric reliability and alignment with expectations.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Evaluation Strategies</head><p>In our RAG system, we implemented and tested a wide range of evaluation metrics. Specifically, our system incorporates metrics for assessing individual RAG components like Information Retrieval (IR) and Answer Generation, as well as the overall pipeline. For IR, we used classical metrics such as Recall@K, Precision@K, mAP, MRR, and nDCG. For answer generation, the implemented metrics were divided into two categories: Syntactic metrics evaluate formal response aspects, including BLEU <ref type="bibr" target="#b11">[12]</ref>, ROUGE <ref type="bibr" target="#b12">[13]</ref>, Precision, Recall, F1, and Exact Match <ref type="bibr" target="#b13">[14]</ref>. These focus on text properties rather than semantic meaning. Semantic metrics evaluate response meaning, including BERT score <ref type="bibr" target="#b14">[15]</ref> and BEM score <ref type="bibr" target="#b15">[16]</ref>. BEM is preferred over BERT due to reported correlation with human evaluations and our empirical findings. LLM-derived Metrics: We implemented in our framework the RAG triad of metrics for the three main steps of a RAG's execution <ref type="bibr" target="#b5">[6]</ref>: (i) Context relevance that assesses if the passage returned is relevant for answering the given query. (ii) Groundedness that assesses if the generated answer is faithful to the retrieved passage or if it contains hallucinated or extrapolated statements beyond the passage. (iii) Answer relevance that assesses if the generated answer is relevant given the query and retrieved passage. In addition, we implemented the Answer correctness that exploits LLMs and gold answers to measure the factual correctness of an answer. In this paper, only a subset of metrics are considered and compared for assessing the quality of the answers (see Section 4.2).</p><p>Manual evaluation.
To verify the reliability of automated evaluation metrics, we implemented a rigorous manual evaluation process to assess the relevance, accuracy, and coherence of the answers generated by our RAG system. This manual evaluation was conducted by three independent human annotators, each with expertise in the domain of the questions posed to the system. For each evaluation session, the annotators were presented with the question, the corresponding answer generated by the RAG system, and the ground truth provided by the original dataset or the customer answers. The primary task for each annotator was to assess the quality of the generated answer in relation to the posed question, employing a discrete scoring 5-point Likert scale. The criteria for scoring were as follows: 1. Very Poor: The generated answer is totally incorrect or irrelevant to the question. This case indicates a failure of the system to comprehend the query or retrieve pertinent information. 2. Poor: The generated answer is predominantly incorrect but with glimpses of relevance suggesting some level of understanding or appropriate retrieval. 3. Neither: The generated answer mixes relevant and irrelevant information almost equally, showcasing the system's partial success in addressing the query. 4. Good: The generated answer is largely correct but includes minor inaccuracies or irrelevant details, demonstrating a strong understanding and response to the question. 5. Very Good: Reserved for answers that are completely correct and fully relevant, reflecting an ideal outcome where the system accurately understood and responded to the query. The annotators conducted their assessments independently to ensure unbiased evaluations. Upon completion, the scores for each question-answer pair were collected and compared. In cases of discrepancy, a consensus discussion was initiated among the annotators to agree on the most accurate score.
This consensus process allowed for mitigating individual bias and considering different perspectives in evaluating the quality of the generated answers. This manual evaluation process helps particularly in assessing the reliability and validity of our system's automated evaluation metrics. By comparing the human-generated scores against the results produced by these automated measures, we can determine the extent to which the automatic metrics accurately reflect human judgment and perception of answer quality.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Experiments</head><p>Considering different domains (Section 4.1), we investigate the reliability of a subset of existing metrics (Section 4.2) for evaluating a RAG system (Section 3.1). We explore the feasibility of adopting reference-free metrics and the correlation among them and the human evaluation (Section 3.2).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.">Datasets</head><p>NarrativeQA -English. A subsample of the NarrativeQA dataset <ref type="bibr" target="#b16">[17]</ref> was used, with 50 book-related and 50 movie script-related questions (1% of the test set), spanning 41 unique books and 42 unique movie scripts. This allowed evaluating the RAG system's performance across two distinct narrative content types.</p><p>Financial Asset Management -Italian. The FinAM-it dataset, created by Altilia, consists of 50 question-answer pairs from Italian asset management documents on topics like investment strategies, risk management, and regulatory compliance. The questions are complex and diverse, often requiring information from multiple paragraphs, with detailed, conversational-style answers. In this paper we focus on evaluating the generated answer's quality of the entire pipeline.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.">Metrics</head><p>In our analysis, we considered the BEM score (BERT matching score) <ref type="bibr" target="#b15">[16]</ref>, which we found experimentally to be the most satisfying among the classic metrics. It is a metric that uses a BERT model <ref type="bibr" target="#b17">[18]</ref> trained to solve an answer equivalence task; this task is solved by training a classifier that tells if two given answers are equivalent and returns the equivalence score. We use the variation of the BEM score, Answers and questions, that exploits the two answers and the question as model input. This variation results in better performance <ref type="bibr" target="#b15">[16]</ref>.</p><p>In addition, we considered novel LLM-derived metrics developed in the RAGAS <ref type="bibr" target="#b5">[6]</ref> and TruLens 2 systems. These metrics offer evaluations both ground truth-based and reference-free. In particular, from RAGAS we used the two main metrics that focus on answers: Answer Correctness and Answer Relevance. More in detail: (i) Answer Correctness 3 : This metric measures the factual correctness of an answer and needs the presence of a ground truth. It employs an LLM to extract factual statements from both the predicted answer and the ground truth, labeling them as True Positives if they are present in both the answers, False Negatives if they are present only in the ground truth, and False Positives if they are present only in the prediction. Then a final F1 score is calculated; this score, in the range (0, 1), is the Answer Correctness. (ii) Answer Relevance<ref type="foot" target="#foot_3">4</ref> : This metric measures how pertinent the generated answer is to the prompt given to the LLM in the generation step. It computes a score in the range (0, 1) as the mean of the cosine similarities between the original question and a set of artificial questions generated by an LLM on the basis of the predicted answer and the given context.
The formula of the score is the following: $\mathrm{AnswerRelevance} = \frac{1}{N}\sum_{i=1}^{N}\mathrm{cosine}(E_o, E_{g_i})$, where $E_o$ is the embedding of the original generated answer and $E_{g_i}$ is the embedding of the $i$-th generated question. From TruLens we used the implemented Answer Relevance metric that prompts an LLM to evaluate the relevance of the answer with respect to the input prompt that includes context and question. The score that the LLM assigns to each answer is in the range (0, 1).</p><p>To study the interrelationships and relative effectiveness among various evaluation metrics, we exploit the Spearman correlation coefficient. The Spearman Rank Correlation <ref type="bibr" target="#b18">[19]</ref> is a non-parametric measure that assesses the statistical dependence between the rankings of two variables. It tells how well the relationship between these variables can be described using a monotonic function. This measure is computed on ranked data, allowing for the analysis of both ordinal variables and continuous variables that have been converted into ranks. The Spearman Rank Correlation coefficient is denoted by 𝜌, and its value ranges from −1 to 1 inclusive, where 1 indicates perfect positive correlation, 0 indicates no correlation, and −1 indicates perfect negative correlation.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3.">Settings</head><p>For this implementation, we employed OpenAI models for the embedding, retrieval, and generation stages of the RAG and to implement evaluations with RAGAS and TruLens. The Ingestion step produced chunks of 1024 characters, balancing semantic integrity with avoiding irrelevant or redundant information. Larger chunks may capture more context but increase noise, while smaller sizes may sacrifice contextual information. These chunks were embedded using OpenAI's text-embedding-ada-002 <ref type="foot" target="#foot_4">5</ref> , a state-of-the-art transformer model for generating highquality text embeddings. For retrieval within the vector store, the system identified the 10 most similar embeddings to previously indexed chunks. During generation, we employed the GPT-4-Turbo model<ref type="foot" target="#foot_5">6</ref> with the following prompt structure:</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>You are a chatbot having a conversation with a human. Given the following extracted parts</head><p>of a long document and a question, create a final answer. If you don't know the answer, just say that you don't know, don't try to make up an answer. Context: { CONTEXT } Chat history: { CHAT_HISTORY } Human: { HUMAN_INPUT } Chatbot:</p><p>This prompt provided the model with instructions, context, and encouraged concise, truthful answers without fabrication.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.4.">Results</head><p>For both books and movies subsamples from the NarrativeQA dataset, as can be seen in Table 2 and Table 3, human judgment shows a moderately strong Spearman correlation with BEM (0.735 and 0.704) and AC RAGAS scores across both GPT-3.5-turbo (0.718, 0.792), and GPT-4-turbo models (0.67 and 0.781). This indicates that these ground truth-based metrics are more aligned with human perception of answer quality. Reference-free metrics show poor correlation with human judgment, especially AR RAGAS (0.234 and 0.483), highlighting the fact that evaluating an answer without ground truth is still a challenging problem for Large Language Models. The analysis of the FinAM-it dataset, as can be seen in Table <ref type="table" target="#tab_3">4</ref>, shows generally lower correlations across all metrics, with the highest correlation being observed between human judgment and AC RAGAS gpt-4-turbo (0.531). This could be related to the fact that the FinAM-it dataset presents more challenging and diverse content that is more difficult to evaluate. Extending the analysis on all the datasets at once, it can be seen that all the metrics still have difficulties approximating the human evaluation in a robust and reliable way.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Conclusion</head><p>Our exploration into evaluating Retrieval Augmented Generation (RAG) systems via ground truth-based and reference-free metrics was driven by the need for reliable evaluation frameworks, particularly for scenarios lacking ground truth data. Our evaluation framework's implementation has demonstrated its potential for facilitating a more comprehensive understanding of these systems' capabilities in such situations. Through rigorous experimentation across different domains and datasets, including NarrativeQA and a specialized industrial dataset, we compared various evaluation methodologies against human judgment. While ground truth-based metrics like BEM and AC RAGAS showed moderate to strong correlation with human judgments across different domains and models, reference-free metrics still face significant challenges in achieving similar correlation levels. This highlights the current limitations of automated metrics in capturing nuanced aspects of human judgment, suggesting an urgent need for further refinement of reference-free evaluation methods. The Spearman correlation analysis reveals that while some metrics align more closely with human assessments, there is still significant room for improvement, especially for more challenging and diverse content like the FinAM-it dataset. These findings underscore the complexity of accurately evaluating RAG systems and the importance of considering domain-specific factors in metric development and selection. The observed limitations can have practical consequences, such as inaccurate system performance assessments, leading to suboptimal deployment decisions and reduced user satisfaction. Looking forward, our study emphasizes developing more nuanced and sophisticated evaluation frameworks that can better approximate human judgment.
This entails improving existing metrics' accuracy and reliability and exploring new methodologies to effectively capture qualitative aspects of generated answers. While our evaluation framework provides valuable insights, we acknowledge several limitations: (i) Current reference-free metrics still struggle to match human judgment, necessitating further refinement. (ii) Metric performance suffers for challenging, domain-specific datasets, highlighting the need for domain-aware or adaptive approaches. (iii) Our analysis covered a subset of available metrics; exploring a wider range, including leveraging advanced LLMs and additional context, is needed. (iv) Results should be validated across different RAG configurations and domains for broader applicability. (v) Despite rigorous human evaluation, inherent subjectivity and potential biases may have impacted findings. We view these limitations as opportunities to contribute to developing more reliable, accurate, and human-like evaluation frameworks that can drive advancements in natural language processing capabilities and the realization of highly effective RAG systems across diverse domains.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: The simplified figure of the implemented RAG System.</figDesc><graphic coords="2,89.29,84.19,203.35,105.07" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1</head><label>1</label><figDesc>Naming and classification of metrics shown in the experimental evaluation</figDesc><table><row><cell>Acronym</cell><cell>Name -Framework</cell><cell>Type</cell></row><row><cell>BEM</cell><cell cols="2">BEM score -TensorFlow GT-based</cell></row><row><cell>AR TruLens</cell><cell>Answer Relevance -TruLens</cell><cell>GT-free</cell></row><row><cell>AR RAGAS</cell><cell>Answer Relevance -RAGAS</cell><cell>GT-free</cell></row><row><cell>AC RAGAS</cell><cell cols="2">Answer Correctness -RAGAS GT-based</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>Table 2</head><label>2</label><figDesc>Spearman correlations on NarrativeQA books subsample</figDesc><table><row><cell>Metrics</cell><cell>Human Judgement</cell><cell>BEM</cell><cell>AR TruLens gpt-3.5-turbo</cell><cell>AR RAGAS gpt-3.5-turbo</cell><cell>AC RAGAS gpt-3.5-turbo</cell><cell>AR TruLens gpt-4-turbo</cell><cell>AR RAGAS gpt-4-turbo</cell><cell>AC RAGAS gpt-4-turbo</cell></row><row><cell>Human Judgement</cell><cell>1.000</cell><cell>0.735</cell><cell>0.436</cell><cell>0.234</cell><cell>0.718</cell><cell>0.420</cell><cell>0.150</cell><cell>0.670</cell></row><row><cell>BEM</cell><cell>0.735</cell><cell>1.000</cell><cell>0.185</cell><cell>0.224</cell><cell>0.740</cell><cell>0.405</cell><cell>-0.026</cell><cell>0.713</cell></row><row><cell>AR TruLens gpt-3.5-turbo</cell><cell>0.436</cell><cell>0.185</cell><cell>1.000</cell><cell>0.197</cell><cell>0.274</cell><cell>0.477</cell><cell>0.178</cell><cell>0.224</cell></row><row><cell>AR RAGAS gpt-3.5-turbo</cell><cell>0.234</cell><cell>0.224</cell><cell>0.197</cell><cell>1.000</cell><cell>0.129</cell><cell>0.156</cell><cell>0.633</cell><cell>0.121</cell></row><row><cell>AC RAGAS gpt-3.5-turbo</cell><cell>0.718</cell><cell>0.740</cell><cell>0.274</cell><cell>0.129</cell><cell>1.000</cell><cell>0.238</cell><cell>0.093</cell><cell>0.854</cell></row><row><cell>AR TruLens gpt-4-turbo</cell><cell>0.420</cell><cell>0.405</cell><cell>0.477</cell><cell>0.156</cell><cell>0.238</cell><cell>1.000</cell><cell>0.122</cell><cell>0.108</cell></row><row><cell>AR RAGAS gpt-4-turbo</cell><cell cols="2">0.150 -0.026</cell><cell>0.178</cell><cell>0.633</cell><cell>0.093</cell><cell>0.122</cell><cell>1.000</cell><cell>0.097</cell></row><row><cell>AC RAGAS gpt-4-turbo</cell><cell>0.670</cell><cell>0.713</cell><cell>0.224</cell><cell>0.121</cell><cell>0.854</cell><cell>0.108</cell><cell>0.097</cell><cell>1.000</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head>Table 3</head><label>3</label><figDesc>Spearman correlations on NarrativeQA movies subsample</figDesc><table><row><cell>Metrics</cell><cell>Human Judgement</cell><cell>BEM</cell><cell>AR TruLens gpt-3.5-turbo</cell><cell>AR RAGAS gpt-3.5-turbo</cell><cell>AC RAGAS gpt-3.5-turbo</cell><cell>AR TruLens gpt-4-turbo</cell><cell>AR RAGAS gpt-4-turbo</cell><cell>AC RAGAS gpt-4-turbo</cell></row><row><cell>Human Judgement</cell><cell>1.000</cell><cell>0.704</cell><cell>0.565</cell><cell>0.483</cell><cell>0.792</cell><cell>0.213</cell><cell>0.411</cell><cell>0.781</cell></row><row><cell>BEM</cell><cell>0.704</cell><cell>1.000</cell><cell>0.522</cell><cell>0.428</cell><cell>0.752</cell><cell>0.235</cell><cell>0.358</cell><cell>0.746</cell></row><row><cell>AR TruLens gpt-3.5-turbo</cell><cell>0.565</cell><cell>0.522</cell><cell>1.000</cell><cell>0.390</cell><cell>0.476</cell><cell>0.270</cell><cell>0.422</cell><cell>0.473</cell></row><row><cell>AR RAGAS gpt-3.5-turbo</cell><cell>0.483</cell><cell>0.428</cell><cell>0.390</cell><cell>1.000</cell><cell>0.403</cell><cell>0.406</cell><cell>0.738</cell><cell>0.421</cell></row><row><cell>AC RAGAS gpt-3.5-turbo</cell><cell>0.792</cell><cell>0.752</cell><cell>0.476</cell><cell>0.403</cell><cell>1.000</cell><cell>0.228</cell><cell>0.358</cell><cell>0.977</cell></row><row><cell>AR TruLens gpt-4-turbo</cell><cell>0.213</cell><cell>0.235</cell><cell>0.270</cell><cell>0.406</cell><cell>0.228</cell><cell>1.000</cell><cell>0.456</cell><cell>0.200</cell></row><row><cell>AR RAGAS gpt-4-turbo</cell><cell>0.411</cell><cell>0.358</cell><cell>0.422</cell><cell>0.738</cell><cell>0.358</cell><cell>0.456</cell><cell>1.000</cell><cell>0.379</cell></row><row><cell>AC RAGAS gpt-4-turbo</cell><cell>0.781</cell><cell>0.746</cell><cell>0.473</cell><cell>0.421</cell><cell>0.977</cell><cell>0.200</cell><cell>0.379</cell><cell>1.000</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_3"><head>Table 4</head><label>4</label><figDesc>Spearman correlations on FinAM-it dataset</figDesc><table><row><cell>Metrics</cell><cell>Human Judgement</cell><cell>BEM</cell><cell>AR TruLens gpt-3.5-turbo</cell><cell>AR RAGAS gpt-3.5-turbo</cell><cell>AC RAGAS gpt-3.5-turbo</cell><cell>AR TruLens gpt-4-turbo</cell><cell>AR RAGAS gpt-4-turbo</cell><cell>AC RAGAS gpt-4-turbo</cell></row><row><cell>Human Judgement</cell><cell>1.000</cell><cell>0.208</cell><cell>0.178</cell><cell>0.153</cell><cell>0.053</cell><cell>0.280</cell><cell>0.230</cell><cell>0.531</cell></row><row><cell>BEM</cell><cell>0.208</cell><cell>1.000</cell><cell>0.214</cell><cell>0.209</cell><cell>0.276</cell><cell>0.001</cell><cell>0.203</cell><cell>0.278</cell></row><row><cell>AR TruLens gpt-3.5-turbo</cell><cell>0.178</cell><cell>0.214</cell><cell>1.000</cell><cell>0.412</cell><cell>0.433</cell><cell>0.181</cell><cell>0.446</cell><cell>0.300</cell></row><row><cell>AR RAGAS gpt-3.5-turbo</cell><cell>0.153</cell><cell>0.209</cell><cell>0.412</cell><cell>1.000</cell><cell>0.463</cell><cell>-0.191</cell><cell>0.608</cell><cell>0.130</cell></row><row><cell>AC RAGAS gpt-3.5-turbo</cell><cell>0.053</cell><cell>0.276</cell><cell>0.433</cell><cell>0.463</cell><cell>1.000</cell><cell>-0.099</cell><cell>0.243</cell><cell>0.255</cell></row><row><cell>AR TruLens gpt-4-turbo</cell><cell>0.280</cell><cell>0.001</cell><cell>0.181</cell><cell>-0.191</cell><cell>-0.099</cell><cell>1.000</cell><cell>-0.009</cell><cell>0.245</cell></row><row><cell>AR RAGAS gpt-4-turbo</cell><cell>0.230</cell><cell>0.203</cell><cell>0.446</cell><cell>0.608</cell><cell>0.243</cell><cell>-0.009</cell><cell>1.000</cell><cell>0.157</cell></row><row><cell>AC RAGAS gpt-4-turbo</cell><cell>0.531</cell><cell>0.278</cell><cell>0.300</cell><cell>0.130</cell><cell>0.255</cell><cell>0.245</cell><cell>0.157</cell><cell>1.000</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_4"><head>Table 5</head><label>5</label><figDesc>Spearman correlations on all datasets</figDesc><table><row><cell>Metrics</cell><cell>Human Judgement</cell><cell>BEM</cell><cell>AR TruLens gpt-3.5-turbo</cell><cell>AR RAGAS gpt-3.5-turbo</cell><cell>AC RAGAS gpt-3.5-turbo</cell><cell>AR TruLens gpt-4-turbo</cell><cell>AR RAGAS gpt-4-turbo</cell><cell>AC RAGAS gpt-4-turbo</cell></row><row><cell>Human Judgement</cell><cell>1.000</cell><cell>0.627</cell><cell>0.423</cell><cell>0.323</cell><cell>0.536</cell><cell>0.314</cell><cell>0.287</cell><cell>0.653</cell></row><row><cell>BEM</cell><cell>0.627</cell><cell>1.000</cell><cell>0.310</cell><cell>0.266</cell><cell>0.654</cell><cell>0.249</cell><cell>0.155</cell><cell>0.711</cell></row><row><cell>AR TruLens gpt-3.5-turbo</cell><cell>0.423</cell><cell>0.310</cell><cell>1.000</cell><cell>0.346</cell><cell>0.303</cell><cell>0.302</cell><cell>0.375</cell><cell>0.302</cell></row><row><cell>AR RAGAS gpt-3.5-turbo</cell><cell>0.323</cell><cell>0.266</cell><cell>0.346</cell><cell>1.000</cell><cell>0.213</cell><cell>0.201</cell><cell>0.682</cell><cell>0.198</cell></row><row><cell>AC RAGAS gpt-3.5-turbo</cell><cell>0.536</cell><cell>0.654</cell><cell>0.303</cell><cell>0.213</cell><cell>1.000</cell><cell>0.208</cell><cell>0.139</cell><cell>0.813</cell></row><row><cell>AR TruLens gpt-4-turbo</cell><cell>0.314</cell><cell>0.249</cell><cell>0.302</cell><cell>0.201</cell><cell>0.208</cell><cell>1.000</cell><cell>0.250</cell><cell>0.187</cell></row><row><cell>AR RAGAS gpt-4-turbo</cell><cell>0.287</cell><cell>0.155</cell><cell>0.375</cell><cell>0.682</cell><cell>0.139</cell><cell>0.250</cell><cell>1.000</cell><cell>0.169</cell></row><row><cell>AC RAGAS gpt-4-turbo</cell><cell>0.653</cell><cell>0.711</cell><cell>0.302</cell><cell>0.198</cell><cell>0.813</cell><cell>0.187</cell><cell>0.169</cell><cell>1.000</cell></row></table></figure>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="1" xml:id="foot_0">https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="2" xml:id="foot_1">https://www.trulens.org/</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="3" xml:id="foot_2">https://docs.ragas.io/en/latest/concepts/metrics/answer_correctness.html</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="4" xml:id="foot_3">https://docs.ragas.io/en/latest/concepts/metrics/answer_relevance.html</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="5" xml:id="foot_4">https://openai.com/blog/new-and-improved-embedding-model</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="6" xml:id="foot_5">https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo</note>
		</body>
		<back>
			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Retrieval augmented language model pre-training</title>
		<author>
			<persName><forename type="first">K</forename><surname>Guu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Tung</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Pasupat</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Chang</surname></persName>
		</author>
		<ptr target="http://proceedings.mlr.press/v119/guu20a.html" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of Machine Learning Research</title>
				<meeting>Machine Learning Research<address><addrLine>PMLR</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2020-07-18">13-18 July 2020. 2020</date>
			<biblScope unit="volume">119</biblScope>
			<biblScope unit="page" from="3929" to="3938" />
		</imprint>
	</monogr>
	<note>Virtual Event</note>
</biblStruct>

<biblStruct xml:id="b1">
	<monogr>
		<author>
			<persName><forename type="first">O</forename><surname>Khattab</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Potts</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Zaharia</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2007.00814</idno>
		<title level="m">Relevance-guided supervision for openqa with colbert</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<monogr>
		<author>
			<persName><forename type="first">K</forename><surname>Shuster</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Poff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Kiela</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Weston</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2104.07567</idno>
		<title level="m">Retrieval augmentation reduces hallucination in conversation</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Retrieving supporting evidence for generative question answering</title>
		<author>
			<persName><forename type="first">S</forename><surname>Huo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Arabzadeh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Clarke</surname></persName>
		</author>
		<idno type="DOI">10.1145/3624918.3625336</idno>
		<idno>doi:10.1145/3624918.3625336</idno>
		<ptr target="http://dx.doi.org/10.1145/3624918.3625336" />
	</analytic>
	<monogr>
		<title level="m">SIGIR-AP</title>
				<imprint>
			<publisher>ACM</publisher>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="11" to="20" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<monogr>
		<author>
			<persName><forename type="first">T</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">G</forename><surname>Patil</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Jain</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Shen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Zaharia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Stoica</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">E</forename><surname>Gonzalez</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2403.10131</idno>
		<title level="m">Raft: Adapting language model to domain specific rag</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<monogr>
		<author>
			<persName><forename type="first">S</forename><surname>Es</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>James</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Espinosa-Anke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Schockaert</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2309.15217</idno>
		<title level="m">Ragas: Automated evaluation of retrieval augmented generation</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<monogr>
		<author>
			<persName><forename type="first">Y</forename><surname>Tang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Yang</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2401.15391</idno>
		<title level="m">Multihop-rag: Benchmarking retrieval-augmented generation for multi-hop queries</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<monogr>
		<author>
			<persName><forename type="first">M</forename><surname>Gao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Ruan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Pu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Wan</surname></persName>
		</author>
		<title level="m">Llm-based nlg evaluation: Current status and challenges</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<monogr>
		<author>
			<persName><forename type="first">Z</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Fang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Chen</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2402.16457</idno>
		<title level="m">Retrievalqa: Assessing adaptive retrieval-augmented generation for short-form open-domain question answering</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<monogr>
		<author>
			<persName><forename type="first">V</forename><surname>Katranidis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Barany</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2403.03888</idno>
		<title level="m">Faaf: Facts as a function for the evaluation of rag systems</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<monogr>
		<title level="m" type="main">Ares: An automated evaluation framework for retrieval-augmented generation systems</title>
		<author>
			<persName><forename type="first">J</forename><surname>Saad-Falcon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Khattab</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Potts</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Zaharia</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2311.09476</idno>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Automatic evaluation of summaries using n-gram co-occurrence statistics</title>
		<author>
			<persName><forename type="first">C.-Y</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Hovy</surname></persName>
		</author>
		<ptr target="https://aclanthology.org/N03-1020" />
	</analytic>
	<monogr>
		<title level="m">Human Language Technology Conference of the North American Chapter of the ACL</title>
				<imprint>
			<date type="published" when="2003">2003</date>
			<biblScope unit="page" from="150" to="157" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">ROUGE: A package for automatic evaluation of summaries</title>
		<author>
			<persName><forename type="first">C.-Y</forename><surname>Lin</surname></persName>
		</author>
		<ptr target="https://aclanthology.org/W04-1013" />
	</analytic>
	<monogr>
		<title level="m">Text Summarization Branches Out, ACL</title>
				<meeting><address><addrLine>Barcelona, Spain</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2004">2004</date>
			<biblScope unit="page" from="74" to="81" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">SQuAD: 100,000+ questions for machine comprehension of text</title>
		<author>
			<persName><forename type="first">P</forename><surname>Rajpurkar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Lopyrev</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Liang</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/D16-1264</idno>
		<ptr target="https://aclanthology.org/D16-1264" />
	</analytic>
	<monogr>
		<title level="m">EMNLP, ACL</title>
				<editor>
			<persName><forename type="first">J</forename><surname>Su</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">K</forename><surname>Duh</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">X</forename><surname>Carreras</surname></persName>
		</editor>
		<meeting><address><addrLine>Austin, Texas</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2016">2016</date>
			<biblScope unit="page" from="2383" to="2392" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<monogr>
		<author>
			<persName><forename type="first">T</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kishore</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">Q</forename><surname>Weinberger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Artzi</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1904.09675</idno>
		<title level="m">Bertscore: Evaluating text generation with bert</title>
				<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<monogr>
		<title level="m" type="main">Tomayto, tomahto. Beyond token-level answer equivalence for question answering evaluation</title>
		<author>
			<persName><forename type="first">J</forename><surname>Bulian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Buck</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Gajewski</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Boerschinger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Schuster</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2202.07654</idno>
		<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<monogr>
		<title level="m" type="main">The narrativeqa reading comprehension challenge</title>
		<author>
			<persName><forename type="first">T</forename><surname>Kočiský</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Schwarz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Blunsom</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Dyer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">M</forename><surname>Hermann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Melis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Grefenstette</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1712.07040</idno>
		<imprint>
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<monogr>
		<author>
			<persName><forename type="first">J</forename><surname>Devlin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M.-W</forename><surname>Chang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Toutanova</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1810.04805</idno>
		<title level="m">Bert: Pre-training of deep bidirectional transformers for language understanding</title>
				<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<monogr>
		<title level="m" type="main">Pearson&apos;s and Spearman&apos;s Correlation</title>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">F</forename><surname>Weaver</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Morales</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">L</forename><surname>Dunn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Godde</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">F</forename><surname>Weaver</surname></persName>
		</author>
		<idno type="DOI">10.1002/9781119454205.ch10</idno>
		<idno>doi:</idno>
		<ptr target="https://doi.org/10.1002/9781119454205.ch10" />
		<imprint>
			<date type="published" when="2017">2017</date>
			<publisher>John Wiley and Sons, Ltd</publisher>
			<biblScope unit="page" from="435" to="471" />
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
