<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Retrieving Information Presented on Web Pages Using Large Language Models: A Case Study</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Thomas</forename><surname>Asselborn</surname></persName>
							<email>thomas.asselborn@uni-hamburg.de</email>
							<affiliation key="aff0">
								<orgName type="department">Institute of Humanities-Centered Artificial Intelligence</orgName>
								<orgName type="institution">Universität Hamburg</orgName>
								<address>
									<addrLine>Warburgstraße 28</addrLine>
									<postCode>20354</postCode>
									<settlement>Hamburg</settlement>
									<country key="DE">Germany</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="department">Centre for the Study of Manuscript Cultures</orgName>
								<orgName type="institution">Universität Hamburg</orgName>
								<address>
									<addrLine>Warburgstraße 26</addrLine>
									<postCode>20354</postCode>
									<settlement>Hamburg</settlement>
									<country key="DE">Germany</country>
								</address>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="department">Cluster of Excellence &apos;Understanding Written Artefacts&apos; (UWA)</orgName>
								<orgName type="institution">University of Hamburg</orgName>
								<address>
									<addrLine>Warburgstraße 26</addrLine>
									<postCode>20354</postCode>
									<settlement>Hamburg</settlement>
									<country key="DE">Germany</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Karsten</forename><surname>Helmholz</surname></persName>
							<email>karsten.helmholz@uni-hamburg.de</email>
							<affiliation key="aff1">
								<orgName type="department">Centre for the Study of Manuscript Cultures</orgName>
								<orgName type="institution">Universität Hamburg</orgName>
								<address>
									<addrLine>Warburgstraße 26</addrLine>
									<postCode>20354</postCode>
									<settlement>Hamburg</settlement>
									<country key="DE">Germany</country>
								</address>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="department">Cluster of Excellence &apos;Understanding Written Artefacts&apos; (UWA)</orgName>
								<orgName type="institution">University of Hamburg</orgName>
								<address>
									<addrLine>Warburgstraße 26</addrLine>
									<postCode>20354</postCode>
									<settlement>Hamburg</settlement>
									<country key="DE">Germany</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Ralf</forename><surname>Möller</surname></persName>
							<email>ralf.moeller@uni-hamburg.de</email>
							<affiliation key="aff0">
								<orgName type="department">Institute of Humanities-Centered Artificial Intelligence</orgName>
								<orgName type="institution">Universität Hamburg</orgName>
								<address>
									<addrLine>Warburgstraße 28</addrLine>
									<postCode>20354</postCode>
									<settlement>Hamburg</settlement>
									<country key="DE">Germany</country>
								</address>
							</affiliation>
						</author>
						<author>
							<affiliation key="aff3">
								<orgName type="department">German Conference on Artificial Intelligence</orgName>
								<address>
									<addrLine>September 23</addrLine>
									<postCode>47th, 2024</postCode>
									<settlement>Würzburg</settlement>
									<country key="DE">Germany</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">Retrieving Information Presented on Web Pages Using Large Language Models: A Case Study</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">A8A5C4E5A10F81980FAE1221312BF4B5</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T18:16+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Artefact Profiling</term>
					<term>Large Language Models (LLMs)</term>
					<term>Generative Pre-Trained Transformer (GPT)</term>
					<term>Retrieval Augmented Generation (RAG)</term>
					<term>Web Pages</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>Developing web pages is a task that requires constant updating of new information. Additionally, multiple web pages with the same information must be developed but compiled differently if multiple user groups are targeted. Thus, we introduce a new approach that uses LLMs (Large Language Models), RAG (Retrieval Augmented Generation) and SCDs (Subjective Content Descriptions) to query the information on a web page and also provide sources to the original data. Since the LLM can rewrite the response based on the target user group, this reduces the need to make multiple web pages. Based on the example of the Artefact Profiling Guide, this approach reduces the need to provide user-specific web pages. Additionally, this method reduces the need for an expert in web programming and designing by offloading the task of presenting new data to the LLM. The prototype system has shown promising results so far. It provides the correct answers grounded by the source and written appropriately for the target group.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>Web pages, like those for universities or corporations, are typically segmented into parts that offer information targeted to a specific user group. A university web page is usually divided into sections specifically curated to provide information for students, people interested in studying, employees, other researchers, the general public, etc. While this approach works fine, as shown in practice, there are specific scenarios where one may want to offer a single web page targeted to many user groups that may not even be known a priori. One example is that the information provided may interest a large group of people with different needs, be it pupils or scientific researchers. In this case, offering different web pages with duplicate content would generally be necessary, each curated for the respective audience. Alternatively, one has the option of offering a single web page written in such a way that it can address all target groups at once. However, web pages designed to be an all-in-one solution run the risk of making them too complicated for some yet too easy for others, thereby disappointing most, if not all, users.</p><p>LLMs utilising the transformer architecture <ref type="bibr" target="#b0">[1]</ref> like GPT (Generative Pre-trained Transformer) <ref type="bibr" target="#b1">[2]</ref> have several beneficial properties for making them an ideal candidate to be used on web pages with different and diverse user groups. On the one hand, they can be tailored to a specific downstream task, allowing them to acquire specific knowledge. 
Additionally, they can adapt their language to the specific user writing queries, e.g., by telling the user that the query should be explained to a 10-year-old child.</p><p>In this paper, we thus propose a different method (Section 3) of dealing with the problem of generating multiple distinct web pages by using LLMs to return the content a user may ask about and to provide citations to the original resource. It aims to investigate whether it is possible to replace web pages with a ChatGPT-like chatbot (Section 4) and which benefits and potential problems this approach may have (Sections 4 and 5). Before going into detail, Section 2 introduces the Artefact Profiling Guide.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.1.">Related Work</head><p>Working with LLMs and RAG is an actively researched topic. Gao et al. <ref type="bibr" target="#b2">[3]</ref> are comparing different methods of applying the general principles of RAG. Ramesh et al. <ref type="bibr" target="#b3">[4]</ref> have proposed context tuning for RAG to improve retrieval.</p><p>RAG is a method that several companies also use in practice to increase productivity. Examples include but are not limited to Telescope (a sales automation platform), Assembly (a human resources platform) and Causal (a financial planning tool). <ref type="bibr" target="#b4">[5]</ref> Also, bigger companies like IBM, Google, NVIDIA, and Microsoft are using RAG for various tasks. <ref type="bibr" target="#b5">[6]</ref> Using LLMs to generate web pages is a feature that Perplexity introduced a few months ago, which they called Perplexity Pages<ref type="foot" target="#foot_0">1</ref> . There, Perplexity queries and results can be displayed automatically on a web page. This helps to produce web pages without the need to know web development and design. However, their approach focuses on making web pages with information already present on the internet and compiling them into a single view. In contrast, our approach is focused on retrieving information that would typically be displayed on a web page with the help of LLMs.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">The Artefact Profiling Guide</head><p>The Artefact Profiling Guide is an online guide written by members of various lab teams of CSMC (Centre for the Study of Manuscript Cultures). Its goal is to explain the scientific methods and analytical means used to analyse written artefacts to foster interdisciplinary work between the humanities, the natural sciences and computer science. It is meant as a broad overview with more detailed explanations provided as links to dedicated sources.</p><p>While the guide is primarily written for a target group of researchers and specialists, it is also meant to provide information to other potential user groups, such as libraries, museums, and other collections that do not necessarily have the expertise or the equipment to do analyses. Another potential user group may be private collectors who want more information about an artefact in their collection. Beyond those specific user groups, the guide is also meant to provide the general public with information about some aspects of the research carried out at CSMC. The general public is a blurry group that could range from young pupils to older people wanting to broaden their knowledge and anyone in between. This variety of target groups makes it difficult to provide the desired explanation level and find a language appropriate for all users.</p><p>Artefact profiling is an interdisciplinary field combining analytical means from various fields to understand written artefacts. 
It combines, among others not mentioned here, multiple so-called "omics" approaches:</p><p>• Genomics: This studies the genetic material in the (written) artefact.</p><p>• Proteomics: Analysis of proteins and peptides.</p><p>• Metabolomics: Metabolites and small molecules are examined.</p><p>• Metallomics: This deals with the metals and their distribution.</p><p>• Isotopolomics: Isotope ratios in the written artefacts are studied.</p><p>Artefact profiling aids in dating, locating and authenticating written artefacts. <ref type="bibr" target="#b6">[7]</ref> Artefact profiling is a dynamic field of research with new approaches emerging and new devices being developed to implement them. (see<ref type="foot" target="#foot_1">2</ref> ). Additionally, researchers are switching universities and positions frequently. Thus, the Artefact Profiling Guide needs to be updated regularly.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Method</head><p>The data in the artefact profiling guide is constantly evolving and probably also not part of most LLMs pre-training. Thus, a way to ensure the LLM can understand new information must be used. Currently, there are two main methods of making sure that the LLM works better on a specific downstream task:</p><p>1. Fine-tuning: In fine-tuning, one takes a so-called pre-trained model, like Llama 3 <ref type="bibr" target="#b7">[8]</ref>, which is typically trained in a self-supervised<ref type="foot" target="#foot_2">3</ref> way. The model is typically fine-tuned in a supervised way using a smaller task-specific data set. With fine-tuning, the model's parameters are changed. While it is far less demanding regarding data set size and hardware resources than pre-training and is thus more feasible for smaller companies or research groups, fine-tuning still needs a GPU to be done efficiently. 2. RAG [9]: RAG is another method of making sure that the LLM works better on a specific downstream task. In contrast to fine-tuning, the internal parameters of the models are not changed; instead, context is provided using automatic prompt augmentation. Typically, the data the user wants to add to the LLM is encoded in a vector database using, e.g., BERT <ref type="bibr" target="#b9">[10]</ref> embeddings.</p><p>When the user sends a query, it is first encoded using the same encoding, the k best results from the vector database are returned and then automatically appended to the query, which is then sent to the LLM. This works because LLMs have a property called in-context learning <ref type="bibr" target="#b10">[11]</ref>, allowing the LLM to answer queries about a topic that was never seen in the pre-training data set.</p><p>We have decided to use RAG for our approach as this is less demanding in terms of hardware requirements, i.e., while fine-tuning requires a dedicated GPU, RAG also works with CPUs. 
Additionally, this is necessary because the data in the Artefact Profiling Guide is dynamically changing.  Everything on the right of the line is executed after every user query.</p><p>Figure <ref type="figure" target="#fig_0">1</ref> shows a broad overview of the process. It is divided into two sections. On the left of the dotted red line is the pre-processing. This only gets executed when a change to the data source, in this case, the Artefact Profiling Guide, has happened. From there, the embeddings to be stored in the vector database and the SCDs <ref type="bibr" target="#b11">[12]</ref> are computed. SCDs contain additional data that is attached to locations in a text document. They can be descriptions, links, or labels and can be automatically generated. Thus, both pre-processing steps are completely automated without the need for human intervention.</p><p>The process on the right of the dotted, red line is executed at every user query. It can be split up into the following steps. It follows the same structure that was already introduced in <ref type="bibr" target="#b12">[13]</ref> as ChatHA.</p><p>1. The user enters a query and selects a user group. A few groups and example queries may be predefined (see Figure <ref type="figure" target="#fig_2">2a</ref>). 2. After the user has entered their query, it is embedded using the same algorithm that was used in the pre-processing, e.g., Sentence BERT <ref type="foot" target="#foot_3">4</ref> . This is then used to retrieve the n most similar entries from the vector database using, e.g., the cosine similarity or Euclidean distance <ref type="bibr" target="#b13">[14]</ref>. The results are then used to augment the original user query. Additionally, the query is also augmented by the user group that was selected prior. 3. Having the augmented query, this is then used to send the query to the LLM. 
This could be the OpenAI API using GPT4 <ref type="bibr" target="#b14">[15]</ref> or a local, open-source LLM like Llama 3 <ref type="bibr" target="#b7">[8]</ref>. 4. The output from the LLM, which we call raw output, is then annotated with links to the original data source. Using the Most Probably Suited SCD (MPS 2 CD) algorithm <ref type="bibr" target="#b15">[16]</ref>, the most suitable SCDs from the set of known SCDs (stored in the SCD matrix) are computed for the raw output. This then provides the citation with links to the source. Figure <ref type="figure" target="#fig_2">2b</ref> shows how this may look practically. 5. The final output with the citations is shown to the user.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Experiment</head><p>A few tests have been performed to test this approach. First, performance for the pre-processing step was evaluated, i.e., the time it took to calculate the embeddings for the vector database and the SCDs. Everything was computed on a Macbook Pro with an M3 processor and 16GB of RAM, which is no special hardware. Thus, it is reasonable to assume that it should take roughly the same time. Since both processes are independent, they can be executed in parallel, which was also done for this test. We used ChromaDB <ref type="foot" target="#foot_4">5</ref> with the default Sentence BERT embeddings for the vector database. The results are as follows:</p><p>• Total number of words in the document: 67,438 • Total time to calculate and store embeddings: 10.5 min.</p><p>• Total time to calculate and store SCDs: 26.5 min.</p><p>All the numbers are in a range that makes it possible to be rerun regularly, e.g., once per week.</p><p>As a second experiment, a few test questions were asked to the system. This is then evaluated qualitatively. The key results are the following.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>RAG does work in our test example</head><p>Mostly, the answers returned by the system are correct and factually backed by the data source. Also, this helped to answer questions that a standard LLM like Llama 3 could not answer. One such example is:</p><p>• Question: "What is ENCI?"  • Answer: "Based on the provided context, ENCI is mentioned as a type of equipment used at the CSMC (Center for the Study of Manuscript Cultures) for non-destructive 3D material characterisation. The equipment is described as being able to study surface topography and mechanical properties, and its mobility is considered to be stationary. It is also mentioned that ENCI can be used for in-situ measurements with portable equipment. It appears that ENCI is a specialised tool used for the analysis of materials and artifacts, particularly in the field of manuscript cultures and studies. "</p><p>Standard Llama 3 was unable to answer that question at all.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>System is capable of automatically changing the "language" used to answer</head><p>The system aims to give answers in a language appropriate to the specific user group. The following example illustrates this:</p><p>• Question: "What is Raman spectroscopy?"</p><p>• Answer for 10-year old: "Raman spectroscopy is a special way to look at things without touching them or hurting them. It's like taking a picture of the things, but instead of using a camera, it uses special light to see what's inside the things. And it's very good for looking at old and special things, like art and treasures, because it doesn't hurt them at all!" • Answer for a private collector: "Dear fellow collector, Raman spectroscopy is a fantastic technique that allows us to examine and analyse our beloved artefacts without causing any harm to them. It's a non-invasive and nondestructive method that uses special light to "see" what's inside our objects, allowing us to learn more about their composition, materials, and even their age. It's been particularly useful in the field of Cultural Heritage, where preserving our treasures is of utmost importance. " • Answer for a researcher: "As a researcher, I can tell you that Raman spectroscopy is a non-invasive and non-destructive analytical technique that relies on the inelastic scattering of monochromatic light by atomic vibrations in the visible, near-infrared, and near-ultraviolet range. This technique has gained significant popularity in the field of Cultural Heritage over the past few decades, as it allows us to analyse samples without causing any harm. The advantages of Raman spectroscopy include its ability to be truly non-destructive. "</p><p>Citation system with SCDs has issues when the language used in answer differs significantly from original language</p><p>While the system can provide the appropriate language depending on the target group, this leads to problems with generating citations. 
Depending on how the LLM rewrites the answer, the MPS 2 CD algorithm may not always be able to find the correct citations. This needs to be investigated in the future in more detail.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Problems and Challenges</head><p>Working with LLMs still has some issues that may be faced also in the context of the Artefact Profiling Guide. The first observation was that the automatic prompt extension influenced the results significantly.</p><p>If the user is interested in information only grounded by the provided documents, it should be clearly stated in the prompt extension. Otherwise, the LLM also tends to give general information in the prompt. Depending on the task, this may or may not be wanted. While RAG is a method that aims at reducing hallucinations, the LLM may still give (partially) incorrect answers. One example is the following:</p><p>• Question: "What is the CSMC?" • Answer (using Llama 3 Instruct 8B): "I apologise for not providing context earlier! The CSMC likely refers to the Center for Systems Medicine and Computational Biology, which appears to be a research institution or organisation. However, without more information, it's difficult to determine the specific details about this center.</p><p>If you could provide more context or clarify what the CSMC is, I'd be happy to help with your original question!"</p><p>This incorrect answer was provided even though the documents provided had the necessary information to give a correct answer.</p><p>Opening an LLM to be used by the public also adds problems that cannot be solved by using RAG or even fine-tuning. They mainly arise because the system should be openly available without logging into it. A few of these issues are discussed here.</p><p>One issue that may occur is that opening the LLM without user registration (registration is probably not what the provider of a public web page wants) opens the possibility for Denial-of-Service (DoS) attacks, i.e., the input is flooded with queries that use up all of the hardware resources on the server making the web page unreachable for all legitimate users. 
Since LLMs have a higher demand for hardware than standard web pages, the limit will likely hit sooner. Thus, mitigation methods need to be thought of thoroughly.</p><p>Additionally, user protection must be incorporated into the system, especially for children. One of the texts in the data set has a medieval Arabic medicine book as an example containing medical exams and treatments. This also includes descriptions and illnesses that may not be suitable for children, yet it would still be technically correct for the LLM to return them as an answer. Some ways must be introduced to avoid giving inappropriate but technically correct answers.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">Conclusion and Outlook</head><p>In this paper, we introduced a new method of getting and providing information on a web page using LLM and RAG. We showed that it is feasible, but we also pointed out some potential issues that needed to be mitigated, such as DoS attacks or child safety issues. Additionally, changing the concrete prompt used for automatic prompt augmentation makes a difference in how the LLM will answer.</p><p>Future focus should be on how citations are handled when the answer uses significantly different words while still giving the correct answer, i.e., when the LLM answers in simple words while the original data source is written in a scientific language. Additionally, user tests with a more extensive user group may provide more insights into evaluating the introduced method. Currently, the method only works with text data, with future work investigating methods of incorporating images and videos that are integrated into the RAG pipeline and in the citation with the SCDs. It would also be beneficial to extend the system to understand the user automatically without telling it which user group the user belongs to.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: Broad overview of the underlying process. Everything on the left of the dotted, red line is part of the pre-processing. This only needs to be done once per change of the content of the Artefact Profiling Guide. Everything on the right of the line is executed after every user query.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head></head><label></label><figDesc>(a) Mockup of how the web page may look. User can select their group or a free input to target generation for their specific needs. Additionally, one can consider adding a few example queries as a starting point (not yet implemented). The pink field with the number inside indicates references the user can follow.(b) Example of how citations may look after clicking on the pink field, as seen in Figure 2a. In this example, information came from a PDF file but can potentially come from any type of text data.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Figure 2 :</head><label>2</label><figDesc>Figure 2: Prototype implementation of the web page replacement. The top shows the question and answer, while the bottom shows what one citation may look like.</figDesc><graphic coords="5,83.28,414.47,428.73,183.18" type="bitmap" /></figure>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="1" xml:id="foot_0">https://www.perplexity.ai/de/hub/blog/perplexity-pages</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="2" xml:id="foot_1">https://www.csmc.uni-hamburg.de/publications/blog/2024-04-30-enci-inauguration.html</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="3" xml:id="foot_2">https://www.ibm.com/topics/self-supervised-learning</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="4" xml:id="foot_3">https://sbert.net/</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="5" xml:id="foot_4">https://www.trychroma.com/</note>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>The research for this contribution was funded by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under Germany's Excellence Strategy -EXC 2176 'Understanding Written Artefacts: Material, Interaction and Transmission in Manuscript Cultures', project no. 390893796. The research was conducted within the scope of the Centre for the Study of Manuscript Cultures (CSMC) at Universität Hamburg.</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Attention is all you need</title>
		<author>
			<persName><forename type="first">A</forename><surname>Vaswani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Shazeer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Parmar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Uszkoreit</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Jones</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">N</forename><surname>Gomez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">U</forename><surname>Kaiser</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Polosukhin</surname></persName>
		</author>
		<ptr target="https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf" />
	</analytic>
	<monogr>
		<title level="m">Advances in Neural Information Processing Systems</title>
				<editor>
			<persName><forename type="first">I</forename><surname>Guyon</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">U</forename><forename type="middle">V</forename><surname>Luxburg</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">S</forename><surname>Bengio</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">H</forename><surname>Wallach</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">R</forename><surname>Fergus</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">S</forename><surname>Vishwanathan</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">R</forename><surname>Garnett</surname></persName>
		</editor>
		<imprint>
			<publisher>Curran Associates, Inc</publisher>
			<date type="published" when="2017">2017</date>
			<biblScope unit="volume">30</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<monogr>
		<author>
			<persName><forename type="first">A</forename><surname>Radford</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Narasimhan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Salimans</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Sutskever</surname></persName>
		</author>
		<ptr target="https://openai.com/research/language-unsupervised" />
		<title level="m">Improving language understanding by generative pre-training</title>
				<imprint>
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<monogr>
		<author>
			<persName><forename type="first">Y</forename><surname>Gao</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2312.10997</idno>
		<title level="m">Retrieval-augmented generation for large language models: A survey</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Context tuning for retrieval augmented generation</title>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">A</forename><surname>Ramesh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Bethi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Vodianik</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">V</forename><surname>Chappidi</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2312.05708" />
	</analytic>
	<monogr>
		<title level="m">EACL Workshop</title>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<monogr>
		<title level="m" type="main">7 examples of retrieval-augmented generation (rag)</title>
		<author>
			<persName><forename type="first">J</forename><surname>Gitlin</surname></persName>
		</author>
		<ptr target="https://www.merge.dev/blog/rag-examples" />
		<imprint>
			<date type="published" when="2024-06-30">2024. 2024-06-30</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<monogr>
		<title level="m" type="main">What is retrieval-augmented generation aka rag?</title>
		<ptr target="https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/" />
		<imprint>
			<date type="published" when="2024-06-30">2024. 2024-06-30</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">Artefact profiling: Panomics approaches for understanding the materiality of written artefacts</title>
		<author>
			<persName><forename type="first">M</forename><surname>Creydt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Fischer</surname></persName>
		</author>
		<idno type="DOI">10.3390/molecules28124872</idno>
	</analytic>
	<monogr>
		<title level="j">Molecules</title>
		<imprint>
			<biblScope unit="volume">28</biblScope>
			<biblScope unit="page">4872</biblScope>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<monogr>
		<title level="m" type="main">Introducing meta llama 3: The most capable openly available llm to date</title>
		<author>
			<orgName>Meta AI</orgName>
		</author>
		<ptr target="https://ai.meta.com/blog/meta-llama-3/" />
		<imprint>
			<date type="published" when="2024-05-23">2024. 2024-05-23</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<monogr>
		<title level="m" type="main">Retrieval-augmented generation for knowledge-intensive nlp tasks</title>
		<author>
			<persName><forename type="first">P</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Perez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Piktus</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Petroni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Karpukhin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Goyal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Küttler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Yih</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Rocktäschel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Riedel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Kiela</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2005.11401</idno>
		<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<monogr>
		<title level="m" type="main">BERT: pre-training of deep bidirectional transformers for language understanding</title>
		<author>
			<persName><forename type="first">J</forename><surname>Devlin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Chang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Toutanova</surname></persName>
		</author>
		<idno>CoRR abs/1810.04805</idno>
		<ptr target="http://arxiv.org/abs/1810.04805" />
		<imprint>
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<monogr>
		<author>
			<persName><forename type="first">S</forename><surname>Min</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Lyu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Holtzman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Artetxe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Hajishirzi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Zettlemoyer</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2202.12837</idno>
		<title level="m">Rethinking the role of demonstrations: What makes in-context learning work?</title>
		<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">To Extend or not to Extend? Context-specific Corpus Enrichment</title>
		<author>
			<persName><forename type="first">F</forename><surname>Kuhr</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Braun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Bender</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Möller</surname></persName>
		</author>
		<idno type="DOI">10.1007/978-3-030-35288-2_29</idno>
	</analytic>
	<monogr>
		<title level="m">Proceedings of AI: Advances in Artificial Intelligence</title>
		<meeting>AI: Advances in Artificial Intelligence</meeting>
		<imprint>
			<date type="published" when="2019">2019</date>
			<biblScope unit="page" from="357" to="368" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">Fine-tuning bert models on demand for information systems explained using training data from pre-modern arabic</title>
		<author>
			<persName><forename type="first">T</forename><surname>Asselborn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Melzer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Aljoumani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Bender</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><forename type="middle">A</forename><surname>Marwitz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Hirschler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Möller</surname></persName>
		</author>
		<ptr target="https://ceur-ws.org/Vol-3580/paper5.pdf" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Workshop on Humanities-Centred Artificial Intelligence (CHAI 2023)</title>
		<meeting>the Workshop on Humanities-Centred Artificial Intelligence (CHAI 2023)</meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="38" to="51" />
		</imprint>
	</monogr>
	<note>CEUR Workshop Proceedings</note>
</biblStruct>

<biblStruct xml:id="b13">
	<monogr>
		<author>
			<persName><forename type="first">A</forename><surname>Singhal</surname></persName>
		</author>
		<ptr target="http://singhal.info/ieee2001.pdf" />
		<title level="m">Modern information retrieval: A brief overview</title>
		<imprint>
			<date type="published" when="2001">2001. 2024-06-28</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<monogr>
		<title level="m" type="main">Gpt-4 technical report</title>
		<author>
			<orgName>OpenAI</orgName>
		</author>
		<idno type="arXiv">arXiv:2303.08774</idno>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
	<note type="report_type">Gpt-4 technical report</note>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">Augmenting and automating corpus enrichment</title>
		<author>
			<persName><forename type="first">F</forename><surname>Kuhr</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Bender</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Braun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Möller</surname></persName>
		</author>
		<idno type="DOI">10.1142/S1793351X20400061</idno>
	</analytic>
	<monogr>
		<title level="j">Int. J. Semantic Computing</title>
		<imprint>
			<biblScope unit="volume">14</biblScope>
			<biblScope unit="page" from="173" to="197" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
