<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">A Novel Multi-Step Prompt Approach for LLM-based Q&amp;As on Banking Supervisory Regulation</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Daniele</forename><surname>Licari</surname></persName>
							<email>daniele.licari@bancaditalia.it</email>
							<affiliation key="aff0">
								<orgName type="department">Banca d&apos;Italia</orgName>
								<address>
									<addrLine>Via Nazionale, 91</addrLine>
									<postCode>00184</postCode>
									<settlement>Rome</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="institution">Scuola Superiore Sant&apos;Anna</orgName>
								<address>
									<addrLine>P.zza dei Martiri della Libertà, 33</addrLine>
									<postCode>56100</postCode>
									<settlement>Pisa</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Canio</forename><surname>Benedetto</surname></persName>
							<email>canio.benedetto@bancaditalia.it</email>
							<affiliation key="aff0">
								<orgName type="department">Banca d&apos;Italia</orgName>
								<address>
									<addrLine>Via Nazionale, 91</addrLine>
									<postCode>00184</postCode>
									<settlement>Rome</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Praveen</forename><surname>Bushipaka</surname></persName>
							<email>praveen.bushipaka@santannapisa.it</email>
							<affiliation key="aff1">
								<orgName type="institution">Scuola Superiore Sant&apos;Anna</orgName>
								<address>
									<addrLine>P.zza dei Martiri della Libertà, 33</addrLine>
									<postCode>56100</postCode>
									<settlement>Pisa</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Alessandro</forename><surname>De Gregorio</surname></persName>
							<email>alessandro.degregorio@bancaditalia.it</email>
							<affiliation key="aff0">
								<orgName type="department">Banca d&apos;Italia</orgName>
								<address>
									<addrLine>Via Nazionale, 91</addrLine>
									<postCode>00184</postCode>
									<settlement>Rome</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Marco</forename><surname>De Leonardis</surname></persName>
							<email>marco.deleonardis@bancaditalia.it</email>
							<affiliation key="aff0">
								<orgName type="department">Banca d&apos;Italia</orgName>
								<address>
									<addrLine>Via Nazionale, 91</addrLine>
									<postCode>00184</postCode>
									<settlement>Rome</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Tommaso</forename><surname>Cucinotta</surname></persName>
							<email>tommaso.cucinotta@santannapisa.it</email>
							<affiliation key="aff1">
								<orgName type="institution">Scuola Superiore Sant&apos;Anna</orgName>
								<address>
									<addrLine>P.zza dei Martiri della Libertà, 33</addrLine>
									<postCode>56100</postCode>
									<settlement>Pisa</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<affiliation key="aff2">
								<orgName type="department">Tenth Italian Conference on Computational Linguistics</orgName>
								<address>
									<addrLine>Dec 04 -06</addrLine>
									<postCode>2024</postCode>
									<settlement>Pisa</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">A Novel Multi-Step Prompt Approach for LLM-based Q&amp;As on Banking Supervisory Regulation</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">CAA2D12072E661C3813812268E84F93D</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T17:37+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Regulatory Q&amp;A</term>
					<term>Banking Supervisory Reporting Regulation</term>
					<term>Artificial Intelligence</term>
					<term>GenAI</term>
					<term>GPT-4o</term>
					<term>RAG</term>
					<term>LLM Evaluator</term>
					<term>0000-0002-2963-9233 (D. Licari)</term>
					<term>0000-0002-8446-9468 (C. Benedetto)</term>
					<term>0009-0009-7753-8662 (P. Bushipaka)</term>
					<term>0000-0001-7577-3655 (A. De Gregorio)</term>
					<term>0009-0004-6523-186X (M. De Leonardis)</term>
					<term>0000-0002-0362-0657 (T. Cucinotta)</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>This paper investigates the use of large language models (LLMs) in analyzing and answering questions related to banking supervisory regulation concerning reporting obligations. We introduce a multi-step prompt construction method that enhances the context provided to the LLM, resulting in more precise and informative answers. This multi-step approach is compared with standard "zero-shot" and "few-shot" approaches, which lack context enrichment. To assess the quality of the generated responses, we utilize an LLM evaluator. Our findings indicate that the multi-step approach significantly outperforms the zero-shot method, producing more comprehensive and accurate responses.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>The advent of generative AI (GenAI), and specifically of large language models (LLMs), offers significant opportunities, among others, in the legal and financial sector, facilitating the implementation of innovative solutions across various domains of activities <ref type="bibr" target="#b0">[1,</ref><ref type="bibr" target="#b1">2,</ref><ref type="bibr" target="#b2">3,</ref><ref type="bibr" target="#b3">4,</ref><ref type="bibr" target="#b4">5]</ref>. One of the most promising applications is the business case for supporting the navigation and analysis of complex regulatory documents <ref type="bibr" target="#b5">[6,</ref><ref type="bibr" target="#b6">7,</ref><ref type="bibr" target="#b7">8,</ref><ref type="bibr" target="#b8">9]</ref>, which can be particularly valuable for compliance officers, legal teams, and other professionals working in financial institutions who need to have a clear and timely understanding of the regulations and the consequently derived obligations.</p><p>Supervisory authorities could benefit from a tool that streamlines the consultation of complex legislation, providing swift responses to entities and enhancing efficiency <ref type="bibr" target="#b9">[10]</ref>. While LLMs offer advantages for this purpose, they also pose risks like bias and inaccuracies <ref type="bibr" target="#b10">[11]</ref>.</p><p>Therefore, it is essential to establish strong verification procedures and retain human supervision to counter these risks. 
The complexity of regulatory documents, with their dense network of cross-referenced texts/acts and specialized content, necessitates careful analysis to retrieve the needed information ensuring at the same time effective risk management and limiting the burden of such manual compliance.</p><p>This study introduces a novel methodology to automate and expedite the "question &amp; answer" (Q&amp;A) process in regulatory compliance, leveraging advanced large language models (LLMs) to provide accurate and timely responses to inquiries about the European Banking Authority's (EBA) reporting regulations. Our multi-step approach aligns with Retrieval-Augmented Generation (RAG) principles, enhancing context retrieval and generative capabilities through mechanisms like explicit extraction of Capital Requirements Regulation (CRR) references, implicit reference analysis, and a dedicated cross-encoder for precise regulatory text retrieval. This methodology ensures tailored response generation suited to the complex regulatory compliance context, where precise and comprehensive answers are crucial.</p><p>Our work finds particular applications within the domain of EBA regulatory reporting because it is characterized by a large and complex set of interrelated documents, including delegated and implementing acts, technical standards, guidelines, and recommendations, which cover various aspects of financial entities. Such complexity makes the business case both challenging and rewarding.</p><p>In this work, we focus on Regulation (EU) N.2013/575, also called Capital Requirements Regulation (CRR) https://eur-lex.europa.eu/legal-content/en/ALL/?uri=celex%3A32013R0575, specifically on the topic of Liquidity Risk as a first use case to evaluate the potential benefit of enriched context for an accurate response generation. 
The main reason for this choice is that this topic is supported by a relatively limited number of regulatory documents, so it was a good starting point since the regulation is not readily available in the form of a structured dataset and its pre-processing is usually a time-consuming task.</p><p>We used the actual EBA Q&amp;As dataset <ref type="bibr" target="#b11">[12]</ref> as the foundation for developing a system capable of generating automated responses to questions formulated by analysts on EBA reporting requirements and rules. By harnessing the capabilities of LLMs we aim to create a tool that can deliver accurate and contextually relevant answers to any inquiry on the content of the CRR.</p><p>Recent studies highlight the potential of LLMs for qualitative assessment <ref type="bibr" target="#b12">[13,</ref><ref type="bibr" target="#b13">14,</ref><ref type="bibr" target="#b14">15,</ref><ref type="bibr" target="#b15">16]</ref>. For this reason, in this work we also propose the use of an "LLM Evaluator" to automate the validation process.</p><p>The structure of this paper is the following. Chapter 2 introduces the methodology and provides a detailed description of the approach adopted in this study; it explains the dataset utilized and the normative retrieval techniques employed to identify the regulatory documents necessary to address the EBA's Q&amp;As. Chapter 3 presents the LLM Evaluator and the evaluation criteria. Chapter 4 reports experimental results and presents the main outcomes of the study. Chapter 5 discusses challenges as well as potential areas for future developments.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Methodology</head><p>This research employs a multi-step methodology to construct a comprehensive prompt for the GPT-4 omni (GPT-4o) language model <ref type="bibr" target="#b16">[17]</ref>, enabling it to answer EBA-related questions effectively. This step-wise approach focuses on enriching the context provided by the user's question. First, it identifies relevant EBA regulations (specifically CRR references) within the inquiry. Second, it incorporates response examples to guide the LLM's output format ensuring alignment with EBA regulations. This enriched context is then leveraged by a powerful LLM to generate more accurate and informative responses (details in Appendix B.1).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Dataset Construction</head><p>To develop and then evaluate our LLM-based Q&amp;A system, firstly we extracted a subset from the EBA's Single-rule-book-qa online resource <ref type="bibr" target="#b11">[12]</ref>, comprising "question-and-answer" pairs submitted to the EBA between 2013 and 2020. In particular, we focused on the following </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Context Enrichment</head><p>The context enrichment process is a three-step approach designed to identify, within the data set, the most relevant CRR references to provide an appropriate content to formulate the answer to the inquiry. The first step simply involves extracting explicit CRR references, if directly mentioned in the question (Article in tab 4). The second step leverages on the capabilities of the GPT-4o (prompt in Appendix C.1) to analyse the "question" and the "background information" to identify other CRR references that are not explicitly stated by the user. The last step of the process utilizes our CRR Ranker model, a cross-encoder architecture that has been trained to identify and retrieve pertinent references from the Capital Requirements Regulation in response to specific inquiries. This 3-step comprehensive approach ensures a broader and potentially more accurate understanding of the inquiry and the specific legal act(s) related to the CRR that the Q&amp;A tool deems applicable.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.1.">CRR Ranker Training</head><p>With regard to the context enrichment, i.e. the CRR Ranker Training, we employed a specifically trained cross-encoder model <ref type="bibr" target="#b18">[18]</ref> to identify relevant CRR references for enriching inquiry context. We used a dedicated "question-article" pair dataset derived from our EBA Q&amp;A Train Dataset, excluding questions related to CRR Article 99 https://www.eba.europa.eu/regulation-and-policy/single-rulebook/interactive-single-rulebook/14212 due to their frequent lack of topical relevance. Each data point consisted of a question (user query and background information), an associated CRR article, and a binary label indicating relevance (1 for relevant, 0 for not applicable).</p><p>We constructed the training dataset by selecting positive and negative samples. Positive samples comprised question-article pairs where the article explicitly addressed the user's query. Additionally, we included pairs formed by questions and implicit CRR references extracted from the user's text, context information, and official response using GPT-4o (used prompt in Appendix C.1).</p><p>Negative training samples were mined by using the BAAI bge-large-en-v1.5 pre-trained language model <ref type="bibr" target="#b19">[19]</ref>. For the CRR Ranker Training we employed a two-phase process for negative sample selection: first, all CRR articles were encoded using the bge-large-en-v1.5 model, and cosine similarity was utilized to rank them relative to the user's question; second, a set of 20 negative examples was randomly chosen from a pre-defined ranking interval (250-300). The choice of 20 negative samples provides a good balance between computational efficiency and the availability of enough training data. 
This approach aimed to balance the representation of relevant and irrelevant information within the training data, ensuring the model learns to distinguish between the user's query and potentially related but ultimately off-topic CRR articles <ref type="bibr" target="#b20">[20]</ref>.</p><p>The final dataset comprised 12,533 unique "question-article" pairs with positive and negative labels. This data was split into training (10,179 pairs) and development (2,354 pairs) sets for model fine-tuning. This fine-tuning aimed to learn robust semantic representations for questions and CRR articles, enabling the model to effectively identify relevant CRR references for enriching user query context.</p><p>We selected the BAAI BGE Reranker v2 m3 model <ref type="bibr" target="#b18">[18]</ref> as the basis for our cross-encoder, owing to its task-specific aptness and its demonstrated superior performance relative to the BGE Reranker Large <ref type="bibr" target="#b19">[19]</ref>, as reported in Section 4. We adopted the Cross-Entropy Binary Classification loss function, following the approach suggested in the BGE Rerank Git repository <ref type="bibr" target="#b21">[21]</ref>. To promote stable convergence, we incorporated a warmup schedule (with a number of warmup steps equal to 0.1 × len(train_data) × num_epochs) that gradually increases the learning rate during the initial phase of training. The entire fine-tuning process was conducted over 4 epochs. We employed an evaluation interval of 800 steps during training and saved the model that achieved the highest F1 score on the development set.</p><p>Finally, we evaluated the model's retrieval ability of CRR items for a given user question on EBA Q&amp;A Test Dataset. This evaluation employed recall metrics at various retrieval cutoffs, including recall@5, recall@10, recall@20, and recall@30 (results in Section 4).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.">Examples Enrichment</head><p>To improve the model's understanding of the desired response format, tone, and content, we adopted a few-shot prompting approach <ref type="bibr" target="#b22">[22]</ref>. This involved extracting five relevant examples from the EBA Q&amp;A Train Dataset with the same topic as the user question we want to answer. These examples served as demonstrations for the model, showcasing the ideal structure, language style, and level of detail expected in the final responses. Notably, the selection process ensured heterogeneity within the chosen topic, meaning the examples covered various aspects to promote a broader understanding. Limiting the number of examples to five struck a balance between providing diverse demonstrations and maintaining cost-efficiency during inference, as the LLM's input token length has limitations.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.4.">Answer Generation</head><p>Figure <ref type="figure">2</ref> in Appendix B.1 details how we construct a comprehensive prompt that enhances GPT-4o's ability to effectively answer user questions. The final prompt in Appendix C.2 integrates the enriched context (extracted CRR references) and the example enrichment (demonstrations of desired response format, tone, and content). This comprehensive prompt is fed to GPT-4o through the OpenAI API, enabling it to generate a well-reasoned and informative response that adheres to the EBA's regulatory framework and professional tone.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.5.">Comparison with RAG Principles</head><p>Our multi-step prompt approach aligns with the core principles of Retrieval-Augmented Generation (RAG) while incorporating tailored enhancements that improve context enrichment for regulatory Q&amp;A tasks. Like RAG, our method integrates information retrieval with language generation, but it adds specialized steps to enhance context enrichment. These include explicit extraction of CRR references, implicit analysis using LLM capabilities, and precise retrieval through a dedicated cross-encoder. Compared to standard RAG, which often relies on single-stage retrieval, our structured multi-step process adds a higher level of granularity, including example enrichment through few-shot prompts. This ensures not only factual accuracy but also alignment with domain-specific language standards, ultimately improving response quality for complex regulatory inquiries. Overall, our approach extends the RAG principles to generate tailored, contextually enriched answers, which is particularly beneficial for the intricate requirements of regulatory compliance.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">LLM Evaluator</head><p>In our pipeline, we employ an LLM Evaluator to assess the quality of generated responses, defined in Section 2, compared to the EBA's answers already provided. Employing an LLM Evaluator offers significant advantages in terms of cost-effectiveness and efficiency compared to traditional human evaluation/comparison methods. Recent research highlights the potential of LLMs for large-scale natural language evaluation tasks <ref type="bibr" target="#b23">[23,</ref><ref type="bibr" target="#b24">24,</ref><ref type="bibr" target="#b25">25]</ref>.</p><p>The evaluation process uses a scale from one to four, based on two evaluation criteria: correctness and completeness. A generated response is considered correct if its content aligns with the information presented in the official answer. Additionally, a response is deemed complete if it incorporates all relevant regulatory references provided in the official answer. The following scoring rubric outlines the evaluation criteria:</p><p>• Score 1: The generated answer is completely incorrect and incomplete compared to the official answer.</p><p>• Score 2: The generated answer is incorrect but either complete or partially complete compared to the official answer. It contains some useful information found in the official answer, but the main statement is incorrect. • Score 3: The generated answer is correct but only partially complete. The main statement matches the official answer, but some information from the official answer is missing. • Score 4: The generated answer is fully correct and complete. It is essentially a rephrased version of the official answer with no significant differences.</p><p>To preliminarily validate the effectiveness of our LLM evaluator, we conducted an experiment using a synthetic dataset. This dataset was carefully designed to test various aspects of language generation and was evaluated by both a human expert and the LLM. 
The alignment between the human expert's assessments and those of the LLM was then analyzed. The complete details of the final prompt used for LLM evaluator are provided in Appendix C.3.</p><p>The dataset comprises 60 Q&amp;A pairs, balanced across the four score categories. For each category, two pairs were excluded as they were used as examples for the prompt for the LLM evaluator, resulting in a final dataset of 52 Q&amp;A pairs to measure the alignment between the human and LLM evaluator. Using GPT-4o, we obtained a Kendall-tau coefficient of 0.77, with a p-value of $6 \cdot 10^{-11}$. These results justified the adoption of the LLM evaluator over a human one, especially for tasks involving prompt optimization and evaluation. The figure in Appendix B.2 illustrates the complete process of evaluating agreement between the LLM evaluator and the human expert.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Experiments and Results</head><p>This section describes the results obtained by measuring retrieval effectiveness and answer quality. Retrieval performance is measured by the number of relevant regulations retrieved (recall) using different encoder models. Answer quality is then evaluated by a separate LLM, which scores each generated response based on factors like relevance and adherence to EBA legal acts. We compare the multi-step prompt approach with a few-shot and zero-shot one focusing on a single topic within the EBA Q&amp;A framework, specifically Liquidity Risk. Finally, we test our Multi-Step pipeline with other LLM models, such as Google Gemini Flash 1.5 and Llama 3.1 70B.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.">CRR Retrieval</head><p>We employed "recall" as the primary metric to assess the effectiveness of bi- and cross-encoder models in retrieving relevant CRR articles based on the information submitted with the inquiry. "Recall" signifies the proportion of truly relevant CRR articles retrieved from the dataset compared to all the pertinent actual articles <ref type="bibr" target="#b26">[26]</ref>. In the context of legal information retrieval, prioritizing the retrieval of all crucial regulatory information for the inquiry makes the recall a particularly relevant metric.</p><p>Our primary objective was to identify a model that delivers exceptional retrieval accuracy while maintaining computational efficiency. This potentially excluded models with an extremely large number of parameters, as they can be computationally expensive to run.</p><p>We conducted a performance comparison between our fine-tuned CRR Ranker and several pre-trained models:</p><p>• Bi-encoders: all-MiniLM-L6-v2 <ref type="bibr" target="#b27">[27]</ref>, gte-large-en-v1.5 <ref type="bibr" target="#b28">[28]</ref>, and bge-large-en-v1.5 <ref type="bibr" target="#b19">[19]</ref>. • Cross-encoders: bge-reranker-large <ref type="bibr" target="#b19">[19]</ref>, bge-reranker-v2-m3 <ref type="bibr" target="#b29">[29,</ref><ref type="bibr" target="#b18">18]</ref>.</p><p>The detailed results (presented in Table 2) show the achieved recall scores on EBA Q&amp;As Test Dataset for each model. Our fine-tuned CRR Ranker significantly outperformed all other models, achieving a more than 20% improvement compared to the best pre-trained model (bge-large-en-v1.5).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.">Answer Generation</head><p>Here we compare the performance of our multi-step approach with a zero-shot one for answering EBA liquidity risk questions. We tested:</p><p>• Zero-Shot Approach: for each question, a standard prompt was provided to the LLM. It encompassed both the specific query and any relevant contextual information they provided. • Few-Shot Approach: for each question, a few examples were provided along with the query to guide the LLM in generating responses. • Multi-Step Approach: for each question, we created prompts following our established multi-step approach, incorporating context enrichment and example enrichment (as detailed in previous sections).</p><p>The LLM Evaluator assessed each response based on its correctness and completeness relative to the official EBA response. As described in Section 3, the LLM Evaluator assigned an overall score on a scale of 1 (completely incorrect and incomplete) to 4 (fully correct and comprehensive).</p><p>Table <ref type="table" target="#tab_2">3</ref> summarizes the evaluation results for responses generated by the different approaches. The "multi-step" approach consistently achieved higher counts in the high-quality rating categories compared to both the "zero-shot" and "few-shot" ones. This demonstrates that the multi-step approach significantly outperformed the other methods in terms of response quality. The LLM evaluator awarded the multi-step approach an average score of 2.7, representing a 12.5% improvement over the zero-shot and few-shot approaches, which both received an average score of 2.4. Notably, a larger portion of the responses generated by our multi-step approach received scores of 3 or higher, indicating correct answers. In contrast, only 2 out of 46 responses generated by the multi-step approach were rated as completely incorrect (score 1), compared to 6 such responses for the zero-shot approach and 11 for the few-shot approach. 
These findings suggest that the context enrichment in the multi-step prompts effectively guides the primary LLM toward generating more comprehensive and informative responses that accurately reflect the EBA regulations. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.1.">Other LLMs</head><p>In this section, we extend our analysis of the multi-step pipeline by incorporating evaluations using additional large language models (LLMs), specifically Google Gemini Flash 1.5 and Llama 3.1 70B. Google Gemini Flash 1.5 is widely recognized for its high-speed processing capabilities and efficiency in response generation, making it a suitable benchmark for comparative performance analysis. Conversely, Llama 3.1 70B is noted for its robustness in handling complex queries while maintaining moderate computational demands, providing an interesting contrast in terms of performance and resource efficiency.</p><p>Our experimental results indicate that the average evaluation score achieved by Google Gemini Flash 1.5 was 2.0, whereas Llama 3.1 70B attained an average score of 2.2. Notably, these scores did not surpass the performance of the GPT-4o zero-shot approach, which underscores the advanced capabilities of GPT-4o in addressing the complexities of regulatory compliance inquiries. This observation highlights the inherent strength of GPT-4o in generating accurate and contextually relevant responses, outperforming the other models under similar conditions. Future research will focus on an in-depth analysis of these models with a view toward optimizing each step of the multi-step pipeline in a model-specific manner. By tailoring our methodology to align with the distinctive strengths and limitations of each model, we aim to further enhance the overall accuracy and reliability of the generated responses.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Challenges and Advancements</head><p>Our work has highlighted several key challenges that are worth discussing. One of the primary issues concerns the limited size of our test dataset. This constraint arose because we focused on the single topic of Liquidity Risk. However, to achieve robust human alignment and ensure the system addresses diverse user inquiries across EBA topics, future efforts should prioritize dataset expansion and human evaluation integration.</p><p>Another topic for reflection is that the study emphasizes the need to retrieve relevant CRR articles. Future research could investigate methods to further refine the generated responses by incorporating legal reasoning and argumentation capabilities into the LLM <ref type="bibr" target="#b30">[30,</ref><ref type="bibr" target="#b31">31]</ref>, and the most relevant Q&amp;As as examples for few-shot prompting <ref type="bibr" target="#b5">[6]</ref>.</p><p>It is also crucial to underscore the importance of optimizing prompts for this kind of application, and we plan to address this moving forward. Our future research endeavors will focus on investigating automatic prompt engineering techniques <ref type="bibr" target="#b32">[32]</ref> leveraging the LLM Evaluator as a metric to optimize. These techniques aim to tailor and optimize prompts based on the specific topic of inquiries, enhancing overall performance.</p><p>Moreover, currently we have utilized only one model, GPT-4o, but we intend to extend our testing to include other models that have demonstrated similar performance levels in the field of open question answering <ref type="bibr" target="#b33">[33]</ref>. 
This will help us identify the most effective model for our application with an unbiased evaluation <ref type="bibr" target="#b34">[34]</ref>.</p><p>Similarly, in the context of LLM evaluators, we also intend to explore additional models, including open-source options <ref type="bibr" target="#b35">[35,</ref><ref type="bibr" target="#b36">36]</ref>, that have shown strong performance in assessing the quality of responses from various LLMs. This approach is expected to increase the correlation between human and LLM evaluations, thereby enhancing the system's overall accuracy and reliability. The scientific community is very active in this area to better understand the limitations of the different types of models considered as evaluators <ref type="bibr" target="#b37">[37]</ref>.</p><p>By addressing the identified limitations through increased human involvement, expanded data coverage, and domain-specific evaluation methods, we believe it is possible to enhance the system's effectiveness and generalizability across a wide range of regulatory domains.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">Conclusion</head><p>This study explored a novel approach for generating automated responses to inquiries on the Regulation (EU) No 575/2013, specifically on the liquidity risk subject. We proposed a multi-step prompt construction method that enriches the context to be provided to LLMs, enabling them to generate more accurate and informative answers. An LLM Evaluator, which demonstrated strong agreement with human experts, was employed to compare our multi-step approach with standard zero-shot and few-shot methods that lack context enrichment. The quality of the generated responses was assessed, and our findings indicate that the multi-step approach significantly outperforms both the zero-shot and few-shot methods, resulting in responses that are more comprehensive and accurate in relation to the EBA regulation. These results suggest that the multi-step prompt construction is a promising approach for enhancing LLM performance in legal information retrieval tasks, particularly within domains with complex regulatory frameworks like regulatory reporting. Even at this early stage, the tool has demonstrated its ability to make the work of the human analyst more efficient. Future research directions include exploring the use of different LLM architectures and investigating alternative methods for incorporating human feedback into the prompt construction process. Lastly, exploring the generalization of this approach to other regulatory domains would be valuable. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>B.1. Multi-Step Approach for Answer Generation</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>C.3. LLM as Evaluator</head><p>Gpt4-omni Prompt I will provide you with two answers to a question. One is the #official answer, which serves as the benchmark. The other is the #generated answer, which needs to be evaluated against the #official answer. You must compare the answers step by step.</p><p>Consider the following definitions for this evaluation:</p><p>-Correctness: A #generated answer is correct if its content aligns with that of the #official answer.</p><p>-Completeness: A #generated answer is complete if it includes all the information present in the #official answer. Your task is to act as an evaluator and rate the #generated answer according to the following scale: RATING 1: The #generated answer is completely incorrect and incomplete compared to the #official answer. RATING 2: The #generated answer is incorrect but either complete or partially complete compared to the #official answer. It contains some useful information found in the #official answer but the main statement is incorrect. RATING 3: The #generated answer is correct but only partially complete. The main statement matches the #official answer, but some information from the #official answer is missing. RATING 4: The #generated answer is fully correct and complete. It is essentially a rephrased version of the #official answer with no significant differences. Please provide a single numerical rating (1-4) followed by a brief explanation for your rating. This prompt was used to compare an AI-generated answer (#generated answer) to an official one (#official answer), rating its correctness, completeness, and providing an explanation.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 2:</head><label>2</label><figDesc>Figure 2: Multi-Step Approach for Answer Generation</figDesc><graphic coords="11,89.29,102.79,416.70,143.80" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0"><head></head><label></label><figDesc></figDesc><graphic coords="10,150.77,356.40,291.69,247.70" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1</head><label>1</label><figDesc>Sample distribution across training, validation, and test sets for CRR-related Q&amp;A and the subset of only Liquidity Risk Q&amp;A.</figDesc><table><row><cell>Set</cell><cell cols="2">CRR-related Q&amp;A Liquidity Risk Q&amp;A</cell></row><row><cell>Training</cell><cell>798</cell><cell>58</cell></row><row><cell>Validation</cell><cell>162</cell><cell>12</cell></row><row><cell>Test</cell><cell>637</cell><cell>46</cell></row><row><cell cols="3">variables: question ID, question, submission date, status,</cell></row><row><cell cols="3">topic, legal act, article [within that act], background infor-</cell></row><row><cell cols="3">mation, final answer, submission date and status (details</cell></row><row><cell cols="3">in Table 4, Appendix A). Secondly, we implemented a two-</cell></row><row><cell cols="3">step filtering process aimed at ensuring model efficacy:</cell></row><row><cell cols="3">by excluding non-English entries, and by focusing on</cell></row><row><cell cols="3">CRR-related questions within the same timeframe. This</cell></row><row><cell cols="3">resulted in a final dataset of 1597 CRR-related questions</cell></row><row><cell cols="3">and answers, which was then split into training (50%),</cell></row><row><cell cols="3">validation (10%), and test sets (40%) for robust evaluation</cell></row><row><cell cols="3">(token number distribution in Figure 1 in Appendix A).</cell></row><row><cell cols="3">The distribution of samples for the dataset is summarized</cell></row><row><cell>in Table 1.</cell><cell></cell><cell></cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>Table 2</head><label>2</label><figDesc>Recall scores on EBA Q&amp;As Test Dataset</figDesc><table><row><cell>Models</cell><cell cols="4">r@5 r@10 r@20 r@30</cell></row><row><cell>all-MiniLM</cell><cell>0.37</cell><cell>0.46</cell><cell>0.55</cell><cell>0.59</cell></row><row><cell>gte-large</cell><cell>0.39</cell><cell>0.48</cell><cell>0.57</cell><cell>0.63</cell></row><row><cell>bge-large</cell><cell>0.41</cell><cell>0.52</cell><cell>0.62</cell><cell>0.67</cell></row><row><cell>bge-reranker-large</cell><cell>0.17</cell><cell>0.23</cell><cell>0.31</cell><cell>0.38</cell></row><row><cell>bge-reranker-v2-m3</cell><cell>0.24</cell><cell>0.31</cell><cell>0.39</cell><cell>0.44</cell></row><row><cell cols="2">CRR Ranker (ours) 0.51</cell><cell>0.67</cell><cell>0.81</cell><cell>0.86</cell></row><row><cell cols="5">risk inquiries, using our LLM as the evaluation system</cell></row><row><cell cols="5">(Figure in Appendix B.3). To this end, we utilized a subset</cell></row><row><cell cols="5">of 46 Q&amp;As from our EBA Q&amp;A Test dataset specifically</cell></row><row><cell>focused on liquidity risk.</cell><cell></cell><cell></cell><cell></cell><cell></cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head>Table 3</head><label>3</label><figDesc>Evaluation results for responses generated by zero-shot, few-shot and multi-step</figDesc><table><row><cell cols="4">Rating zero-shot few-shot multi-step (gpt4o)</cell></row><row><cell>1</cell><cell>6</cell><cell>12</cell><cell>2</cell></row><row><cell>2</cell><cell>18</cell><cell>11</cell><cell>14</cell></row><row><cell>3</cell><cell>19</cell><cell>16</cell><cell>26</cell></row><row><cell>4</cell><cell>3</cell><cell>7</cell><cell>4</cell></row></table></figure>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>The authors would like to express their sincere gratitude to Vincenzo Capone, Pamela Maggiori, Daniele Bovi, Fabio Zambuto, Francesca Monacelli, and Roberto Sabbatini (Bank of Italy) for their insightful comments and stimulating discussions on an earlier draft of this paper. Their feedback greatly enhanced the clarity and focus of our work. They would also like to thank the anonymous reviewers for their invaluable suggestions and constructive feedback.</p></div>
			</div>

			<div type="annex">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>A. Dataset Table 4</head><p>EBA Q&amp;As dataset. For this research, we focused on the fields highlighted in yellow.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Variable Name Description</head><p>Question ID The unique identifier for each question.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Topic</head><p>The general topic or category under which the question falls.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Subject matter</head><p>The specific subject matter of the question.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Legal act</head><p>The specific legal act to which the question relates. (e.g., CRR) Article</p><p>The specific article of the legal act to which the question relates. COM Delegated or Implementing Acts/RTS/ITS/GLs/Recommendations Other legislation, standards, guidelines or recommendations to which the question relates.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Article/Paragraph</head><p>The specific article or paragraph within the above-mentioned Question</p><p>The actual question asked. Background on the question Any additional information or context provided by the question submitter.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Final answer</head><p>The official answer provided to the question.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Submission date</head><p>The date when the question was submitted.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Final publishing date</head><p>The date when the final answer to the question was published.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Status</head><p>The current status of the question (e.g. Final, rejected, etc.).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Type of submitter</head><p>The type of entity that submitted the question (e.g. Credit institution, investment firm, etc.).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Answer prepared by</head><p>The entity that prepared the answer to the question. </p></div>			</div>
			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<monogr>
		<title level="m" type="main">BloombergGPT: A Large Language Model for Finance</title>
		<author>
			<persName><forename type="first">S</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Irsoy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Dabravolski</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Dredze</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Gehrmann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Kambadur</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Rosenberg</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Mann</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2303.17564</idno>
		<ptr target="http://arxiv.org/abs/2303.17564" />
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
	<note>cs, q-fin</note>
</biblStruct>

<biblStruct xml:id="b1">
	<monogr>
		<title level="m" type="main">Large Language Models in Law: A Survey</title>
		<author>
			<persName><forename type="first">J</forename><surname>Lai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Gan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Qi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">S</forename><surname>Yu</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2312.03718</idno>
		<idno type="arXiv">arXiv:2312.03718</idno>
		<ptr target="http://arxiv.org/abs/2312.03718" />
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<monogr>
		<author>
			<persName><forename type="first">C</forename><surname>Biancotti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Camassa</surname></persName>
		</author>
		<idno type="DOI">10.2139/ssrn.4533699</idno>
		<ptr target="https://papers.ssrn.com/abstract=4533699" />
		<title level="m">Loquacity and Visible Emotion: ChatGPT as a Policy Advisor</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<monogr>
		<title level="m" type="main">Large Language Models as Simulated Economic Agents: What Can We Learn from Homo Silicus?</title>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">J</forename><surname>Horton</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2301.07543v1" />
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Large language models and their possible uses in law</title>
		<author>
			<persName><forename type="first">P</forename><surname>Homoki</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Ződi</surname></persName>
		</author>
		<idno type="DOI">10.1556/2052.2023.00475</idno>
		<ptr target="https://akjournals.com/view/journals/2052/64/3/article-p435.xml" />
	</analytic>
	<monogr>
		<title level="j">Hungarian Journal of Legal Studies</title>
		<imprint>
			<publisher>Akadémiai Kiadó</publisher>
			<date type="published" when="2024">2024</date>
			<biblScope unit="volume">64</biblScope>
			<biblScope unit="page" from="435" to="455" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<monogr>
		<title level="m" type="main">Cbr-rag: Case-based reasoning for retrieval augmented generation in llms for legal question answering</title>
		<author>
			<persName><forename type="first">N</forename><surname>Wiratunga</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Abeyratne</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Jayawardena</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Martin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Massie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Nkisi-Orji</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Weerasinghe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Liret</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Fleisch</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2404.04302</idno>
		<ptr target="https://arxiv.org/abs/2404.04302" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<monogr>
		<title level="m" type="main">Interpretable Long-Form Legal Question Answering with Retrieval-Augmented Large Language Models</title>
		<author>
			<persName><forename type="first">A</forename><surname>Louis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Van Dijck</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Spanakis</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2309.17050</idno>
		<idno type="arXiv">arXiv:2309.17050</idno>
		<ptr target="http://arxiv.org/abs/2309.17050" />
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">GLQA: A Generation-based Method for Legal Question Answering</title>
		<author>
			<persName><forename type="first">W</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Shen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Peng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Wang</surname></persName>
		</author>
		<idno type="DOI">10.1109/IJCNN54540.2023.10191483</idno>
		<idno type="ISSN">2161-4407</idno>
	</analytic>
	<monogr>
		<title level="m">International Joint Conference on Neural Networks (IJCNN)</title>
				<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="1" to="8" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Exploring the state of the art in legal QA systems</title>
		<author>
			<persName><forename type="first">A</forename><surname>Abdallah</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Piryani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Jatowt</surname></persName>
		</author>
		<idno type="DOI">10.1186/s40537-023-00802-8</idno>
		<ptr target="https://doi.org/10.1186/s40537-023-00802-8" />
	</analytic>
	<monogr>
		<title level="j">Journal of Big Data</title>
		<imprint>
			<biblScope unit="volume">10</biblScope>
			<biblScope unit="page">127</biblScope>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<monogr>
		<author>
			<persName><forename type="first">J</forename><surname>Prenio</surname></persName>
		</author>
		<ptr target="https://www.bis.org/fsi/publ/insights58.htm" />
		<title level="m">Peering through the hype -assessing suptech tools&apos; transition from experimentation to supervision</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<monogr>
		<author>
			<persName><forename type="first">L</forename><surname>Huang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Ma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Zhong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Feng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Peng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Feng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Qin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Liu</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2311.05232</idno>
		<ptr target="https://arxiv.org/abs/2311.05232" />
		<title level="m">A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<monogr>
		<ptr target="https://www.eba.europa.eu/single-rule-book-qa" />
		<title level="m">Single Rulebook Q&amp;A | European Banking Authority</title>
				<imprint>
			<date type="published" when="2013">2013-2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<monogr>
		<title level="m" type="main">FLASK: Finegrained Language Model Evaluation based on Alignment Skill Sets</title>
		<author>
			<persName><forename type="first">S</forename><surname>Ye</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Hwang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Jo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Thorne</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Seo</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2307.10928</idno>
		<idno type="arXiv">arXiv:2307.10928</idno>
		<ptr target="http://arxiv.org/abs/2307.10928" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<monogr>
		<title level="m" type="main">Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena</title>
		<author>
			<persName><forename type="first">L</forename><surname>Zheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W.-L</forename><surname>Chiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Sheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Zhuang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhuang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><forename type="middle">P</forename><surname>Xing</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">E</forename><surname>Gonzalez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Stoica</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2306.05685</idno>
		<idno type="arXiv">arXiv:2306.05685</idno>
		<ptr target="http://arxiv.org/abs/2306.05685" />
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">G-Eval: NLG Evaluation using Gpt-4 with Better Human Alignment</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Iter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Zhu</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2023.emnlp-main.153</idno>
		<ptr target="https://aclanthology.org/2023.emnlp-main.153" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">H</forename><surname>Bouamor</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><surname>Pino</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">K</forename><surname>Bali</surname></persName>
		</editor>
		<meeting>the 2023 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics<address><addrLine>Singapore</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="2511" to="2522" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<monogr>
		<title level="m" type="main">ChatEval: Towards Better LLM-based Evaluators through Multi-Agent Debate</title>
		<author>
			<persName><forename type="first">C.-M</forename><surname>Chan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Su</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Xue</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Liu</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2308.07201</idno>
		<idno type="arXiv">arXiv:2308.07201</idno>
		<ptr target="http://arxiv.org/abs/2308.07201" />
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<monogr>
		<title/>
		<author>
			<persName><forename type="first">J</forename><surname>Openai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Achiam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Adler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Agarwal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Ahmad</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><forename type="middle">L</forename><surname>Akkaya</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Aleman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Almeida</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Altenschmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Altman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Anadkat</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Avila</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Babuschkin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Balaji</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Balcom</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Baltescu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Bao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Bavarian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Belgum</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Bello</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Berdine</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Bernadett-Shapiro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Berner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Bogdonoff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Boiko</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A.-L</forename><surname>Boyd</surname></persName>
		</author>
		<imprint/>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<monogr>
		<title/>
		<author>
			<persName><forename type="first">G</forename><surname>Brakman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Brockman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Brooks</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Brundage</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Button</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Cai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Campbell</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Cann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Carey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Carlson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Carmichael</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Chan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Chang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Chantzis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Chess</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Cho</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><forename type="middle">W</forename><surname>Chu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Chung</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Cummings</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Currier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Dai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Decareaux</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Degry</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Deutsch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Deville</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Dhar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Dohan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Dowling</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Dunning</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ecoffet</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Eleti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Eloundou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Farhi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Fedus</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">P</forename><surname>Felix</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Fishman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Forte</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Fulford</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Gao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Georges</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Gibson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Goel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Gogineni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Goh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Gontijo-Lopes</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Gordon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Grafstein</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Gray</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Greene</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">S</forename><surname>Gross</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Gu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Guo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Hallacy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Han</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Harris</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>He</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Heaton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Heidecke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Hesse</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Hickey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Hickey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Hoeschele</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Houghton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Hsu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Huizinga</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Jain</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Jain</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Jang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Jiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Jiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Jin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Jin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Jomoto</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Jonn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Jun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Kaftan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Kaiser</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Kamali</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">S</forename><surname>Kanitscheider</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Keskar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Khan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">W</forename><surname>Kilpatrick</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">H</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kirchner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Kiros</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Knight</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Kokotajlo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Kondraciuk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Kondrich</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Konstantinidis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Kosic</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Krueger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Kuo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Lampe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Leike</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Leung</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Levy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Lim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Litwin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Lopez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Lowe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Lue</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Makanju</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Malfacini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Manning</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Markov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Markovski</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Martin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mayer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Mayne</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">M</forename><surname>Mcgrew</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Mckinney</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Mcleavey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Mcmillan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Mcneil</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Medina</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Mehta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Menick</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Metz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Mishchenko</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Mishkin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Monaco</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Morikawa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Mossing</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Mu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Murati</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Murk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mély</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Nair</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Nakano</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Nayak</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Neelakantan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Ngo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Noh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Ouyang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>O'keefe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Pachocki</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Paino</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Palermo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Pantuliano</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Parascandolo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Parish</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Parparita</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Passos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Pavlov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Peng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><forename type="middle">D A B</forename><surname>Perelman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Peres</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><forename type="middle">P D O</forename><surname>Petrov</surname></persName>
		</author>
		<author>
			<persName><surname>Pinto</surname></persName>
		</author>
		<author>
			<persName><surname>Michael</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Pokorny</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><forename type="middle">H</forename><surname>Pokrass</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Pong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Powell</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Power</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Power</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Proehl</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Puri</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Radford</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Rae</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Ramesh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Raymond</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Real</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Rimbach</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Ross</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Rotsted</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Roussez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Ryder</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Saltarelli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Sanders</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Santurkar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Sastry</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Schmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Schnurr</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Schulman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Selsam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Sheppard</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Sherbakov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Shieh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Shoker</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Shyam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Sidor</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Sigler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Simens</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Sitkin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Slama</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Sohl</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Sokolowsky</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Song</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><forename type="middle">P</forename><surname>Staudacher</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Such</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Summers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Sutskever</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Tang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">B</forename><surname>Tezak</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Thompson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Tillet</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Tootoonchian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Tseng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Tuggle</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Turley</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">F C</forename><surname>Tworek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Uribe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Vallone</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Vijayvergiya</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Voss</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">J</forename><surname>Wainwright</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Ward</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">J</forename><surname>Wei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Weinmann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Welihinda</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Welinder</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Weng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Weng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Wiethoff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Willner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Winter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Wolrich</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Wong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Workman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Xiao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Yoo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Yuan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Zaremba</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Zellers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Zhao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Zheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Zhuang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Zhuk</surname></persName>
		</author>
		<author>
			<persName><surname>Zoph</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2303.08774</idno>
		<idno type="arXiv">arXiv:2303.08774</idno>
		<ptr target="http://arxiv.org/abs/2303.08774.doi:10.48550/arXiv.2303.08774" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
	<note type="report_type">GPT-4 Technical Report</note>
</biblStruct>

<!-- Reference b18: arXiv preprint arXiv:2402.03216 (dated 2024) by Chen, Xiao, Zhang, Luo, Lian and Liu.
     The title is the cited work's own title, so it is marked type="main" and placed first,
     matching the other monograph entries in this listing. -->
<biblStruct xml:id="b18">
	<monogr>
		<title level="m" type="main">Bge m3-embedding: Multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation</title>
		<author>
			<persName><forename type="first">J</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Xiao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Luo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Lian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Liu</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2402.03216</idno>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<!-- Reference b19: arXiv preprint arXiv:2309.07597 (dated 2023) by Xiao, Liu, Zhang and Muennighoff.
     The title is the cited work's own title, so it is marked type="main" and placed first,
     matching the other monograph entries in this listing. -->
<biblStruct xml:id="b19">
	<monogr>
		<title level="m" type="main">C-pack: Packaged resources to advance general chinese embedding</title>
		<author>
			<persName><forename type="first">S</forename><surname>Xiao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Muennighoff</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2309.07597</idno>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<!-- Reference b20: Xuan, Stylianou, Liu and Pless (dated 2021), arXiv:2007.12749.
     The URL and the arXiv identifier are kept in separate elements: ptr carries only the
     resolvable link, idno carries the identifier (same form as the other arXiv entries). -->
<biblStruct xml:id="b20">
	<monogr>
		<title level="m" type="main">Hard negative examples are hard, but useful</title>
		<author>
			<persName><forename type="first">H</forename><surname>Xuan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Stylianou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Pless</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2007.12749</idno>
		<ptr target="https://arxiv.org/abs/2007.12749" />
		<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<!-- Reference b21: web resource (GitHub page of the FlagEmbedding reranker), dated 2024.
     The title is marked type="main" and placed first, matching the other monograph entries.
     NOTE(review): the author list is identical to entry b19; confirm against the source
     reference that it was not carried over by the extractor. -->
<biblStruct xml:id="b21">
	<monogr>
		<title level="m" type="main">FlagEmbedding/FlagEmbedding/reranker at master • FlagOpen/FlagEmbedding</title>
		<author>
			<persName><forename type="first">S</forename><surname>Xiao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Muennighoff</surname></persName>
		</author>
		<ptr target="https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/reranker" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<!-- Reference b22: Brown et al. (dated 2020), arXiv:2005.14165.
     The URL and the arXiv identifier are kept in separate elements: ptr carries only the
     resolvable link, idno carries the identifier (same form as the other arXiv entries). -->
<biblStruct xml:id="b22">
	<monogr>
		<title level="m" type="main">Language models are few-shot learners</title>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">B</forename><surname>Brown</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Mann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Ryder</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Subbiah</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kaplan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Dhariwal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Neelakantan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Shyam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Sastry</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Askell</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Agarwal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Herbert-Voss</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Krueger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Henighan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Child</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ramesh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">M</forename><surname>Ziegler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Winter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Hesse</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Sigler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Litwin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Gray</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Chess</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Clark</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Berner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Mccandlish</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Radford</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Sutskever</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Amodei</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2005.14165</idno>
		<ptr target="https://arxiv.org/abs/2005.14165" />
		<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b23">
	<!-- arXiv preprint: the landing-page URL (ptr) and the arXiv identifier (idno) are kept as separate values. -->
	<monogr>
		<title level="m" type="main">Geval: Nlg evaluation using gpt-4 with better human alignment</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Iter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Zhu</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2303.16634</idno>
		<ptr target="https://arxiv.org/abs/2303.16634" />
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b24">
	<!-- arXiv preprint: the landing-page URL (ptr) and the arXiv identifier (idno) are kept as separate values. -->
	<monogr>
		<title level="m" type="main">Alpacafarm: A simulation framework for methods that learn from human feedback</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Dubois</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Taori</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Gulrajani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Ba</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Guestrin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Liang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">B</forename><surname>Hashimoto</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2305.14387</idno>
		<ptr target="https://arxiv.org/abs/2305.14387" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b25">
	<!-- arXiv preprint: the landing-page URL (ptr) and the arXiv identifier (idno) are kept as separate values. -->
	<monogr>
		<title level="m" type="main">Gptscore: Evaluate as you desire</title>
		<author>
			<persName><forename type="first">J</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S.-K</forename><surname>Ng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Jiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Liu</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2302.04166</idno>
		<ptr target="https://arxiv.org/abs/2302.04166" />
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b26">
	<!-- Book reference: "USA" is the place of publication, so it is recorded as pubPlace inside imprint rather than as a meeting address. -->
	<monogr>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">D</forename><surname>Manning</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Raghavan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Schütze</surname></persName>
		</author>
		<title level="m">Introduction to Information Retrieval</title>
		<imprint>
			<publisher>Cambridge University Press</publisher>
			<pubPlace>USA</pubPlace>
			<date type="published" when="2008">2008</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b27">
	<!-- arXiv/CoRR preprint: the original CoRR reference string is kept as-is; the arXiv identifier (idno) and the landing-page URL (ptr) are separate values. -->
	<monogr>
		<title level="m" type="main">PAQ: 65 million probably-asked questions and what you can do with them</title>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">S H</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Minervini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Küttler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Piktus</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Stenetorp</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Riedel</surname></persName>
		</author>
		<idno>CoRR abs/2102.07033</idno>
		<idno type="arXiv">arXiv:2102.07033</idno>
		<ptr target="https://arxiv.org/abs/2102.07033" />
		<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b28">
	<!-- arXiv preprint entry: the identifier is recorded in idno type="arXiv" and the preprint status in the trailing note type="report_type"; no ptr URL is present. -->
	<monogr>
		<author>
			<persName><forename type="first">Z</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Long</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Xie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Zhang</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2308.03281</idno>
		<title level="m">Towards general text embeddings with multi-stage contrastive learning</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b29">
	<!-- Entry identified only by its arXiv id (idno type="arXiv"); it carries no ptr URL and no report_type note. -->
	<monogr>
		<title level="m" type="main">Making large language models a better foundation for dense retrieval</title>
		<author>
			<persName><forename type="first">C</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Xiao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Shao</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2312.15503</idno>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b30">
	<!-- Proceedings paper: ptr holds only the ACL Anthology URL (the DOI lives in idno type="DOI"), and the publisher is recorded in imprint instead of being appended to the proceedings title. -->
	<analytic>
		<title level="a" type="main">Exploring the effectiveness of prompt engineering for legal reasoning tasks</title>
		<author>
			<persName><forename type="first">F</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Quartey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Schilder</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2023.findings-acl.858</idno>
		<ptr target="https://aclanthology.org/2023.findings-acl.858" />
	</analytic>
	<monogr>
		<title level="m">Findings of the Association for Computational Linguistics: ACL 2023</title>
		<editor>
			<persName><forename type="first">A</forename><surname>Rogers</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><surname>Boyd-Graber</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">N</forename><surname>Okazaki</surname></persName>
		</editor>
		<meeting><address><addrLine>Toronto, Canada</addrLine></address></meeting>
		<imprint>
			<publisher>Association for Computational Linguistics</publisher>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="13582" to="13596" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b31">
	<!-- The leading "0x." belongs to the title (team name "0x.yuan"), not to the date; the date text now matches its when="2024" value. NOTE(review): confirm the title against the source PDF. -->
	<analytic>
		<title level="a" type="main">0x.yuan at semeval-2024 task 5: Enhancing legal argument reasoning with structured prompts</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Kao</surname></persName>
		</author>
		<ptr target="https://api.semanticscholar.org/CorpusID:270765544" />
	</analytic>
	<monogr>
		<title level="m">International Workshop on Semantic Evaluation</title>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b32">
	<!-- arXiv preprint: the landing-page URL (ptr) and the arXiv identifier (idno) are kept as separate values. -->
	<monogr>
		<title level="m" type="main">Prompt engineering a prompt engineer</title>
		<author>
			<persName><forename type="first">Q</forename><surname>Ye</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Axmed</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Pryzant</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Khani</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2311.05661</idno>
		<ptr target="https://arxiv.org/abs/2311.05661" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b33">
	<!-- arXiv preprint: the landing-page URL (ptr) and the arXiv identifier (idno) are kept as separate values. -->
	<monogr>
		<title level="m" type="main">Olympicarena medal ranks: Who is the most intelligent ai so far?</title>
		<author>
			<persName><forename type="first">Z</forename><surname>Huang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Xia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Liu</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2406.16772</idno>
		<ptr target="https://arxiv.org/abs/2406.16772" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b34">
	<!-- arXiv preprint: the landing-page URL (ptr) and the arXiv identifier (idno) are kept as separate values. -->
	<monogr>
		<title level="m" type="main">Llm evaluators recognize and favor their own generations</title>
		<author>
			<persName><forename type="first">A</forename><surname>Panickssery</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">R</forename><surname>Bowman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Feng</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2404.13076</idno>
		<ptr target="https://arxiv.org/abs/2404.13076" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b35">
	<!-- arXiv preprint: the landing-page URL (ptr) and the arXiv identifier (idno) are kept as separate values. -->
	<monogr>
		<title level="m" type="main">Prometheus 2: An open source language model specialized in evaluating other language models</title>
		<author>
			<persName><forename type="first">S</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Suk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Longpre</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><forename type="middle">Y</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Shin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Welleck</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Neubig</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Seo</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2405.01535</idno>
		<ptr target="https://arxiv.org/abs/2405.01535" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b36">
	<!-- arXiv preprint: the landing-page URL (ptr) and the arXiv identifier (idno) are kept as separate values. -->
	<monogr>
		<title level="m" type="main">The biggen bench: A principled benchmark for fine-grained evaluation of language models with language models</title>
		<author>
			<persName><forename type="first">S</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Suk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">Y</forename><surname>Cho</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Longpre</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Yoon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Son</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Cho</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Shafayat</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Baek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">H</forename><surname>Park</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Hwang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Jo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Cho</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Shin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Oh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Ho</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">J</forename><surname>Joo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Ko</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Chae</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Shin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Jang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Ye</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><forename type="middle">Y</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Welleck</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Neubig</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Seo</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2406.05761</idno>
		<ptr target="https://arxiv.org/abs/2406.05761" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b37">
	<!-- arXiv preprint: the landing-page URL (ptr) and the arXiv identifier (idno) are kept as separate values. -->
	<monogr>
		<title level="m" type="main">On the limitations of fine-tuned judge models for llm evaluation</title>
		<author>
			<persName><forename type="first">H</forename><surname>Huang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Qu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Zhou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Zhao</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2403.02839</idno>
		<ptr target="https://arxiv.org/abs/2403.02839" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
