<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Minerva LLMs: The First Family of Large Language Models Trained from Scratch on Italian Data</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Riccardo</forename><surname>Orlando</surname></persName>
							<email>orlando@diag.uniroma1.it</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Dipartimento di Ingegneria Informatica</orgName>
								<orgName type="department" key="dep2">Automatica e Gestionale</orgName>
								<orgName type="laboratory">Sapienza NLP Group</orgName>
								<orgName type="institution">Sapienza University of Rome</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Luca</forename><surname>Moroni</surname></persName>
							<email>moroni@diag.uniroma1.it</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Dipartimento di Ingegneria Informatica</orgName>
								<orgName type="department" key="dep2">Automatica e Gestionale</orgName>
								<orgName type="laboratory">Sapienza NLP Group</orgName>
								<orgName type="institution">Sapienza University of Rome</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Pere-Lluís</forename><surname>Huguet Cabot</surname></persName>
							<email>huguetcabot@diag.uniroma1.it</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Dipartimento di Ingegneria Informatica</orgName>
								<orgName type="department" key="dep2">Automatica e Gestionale</orgName>
								<orgName type="laboratory">Sapienza NLP Group</orgName>
								<orgName type="institution">Sapienza University of Rome</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Edoardo</forename><surname>Barba</surname></persName>
							<email>barba@diag.uniroma1.it</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Dipartimento di Ingegneria Informatica</orgName>
								<orgName type="department" key="dep2">Automatica e Gestionale</orgName>
								<orgName type="laboratory">Sapienza NLP Group</orgName>
								<orgName type="institution">Sapienza University of Rome</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Simone</forename><surname>Conia</surname></persName>
							<email>conia@diag.uniroma1.it</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Dipartimento di Ingegneria Informatica</orgName>
								<orgName type="department" key="dep2">Automatica e Gestionale</orgName>
								<orgName type="laboratory">Sapienza NLP Group</orgName>
								<orgName type="institution">Sapienza University of Rome</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Sergio</forename><surname>Orlandini</surname></persName>
							<email>s.orlandini@cineca.it</email>
							<affiliation key="aff1">
								<orgName type="institution">CINECA</orgName>
								<address>
									<settlement>Bologna</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Giuseppe</forename><surname>Fiameni</surname></persName>
							<email>gfiameni@nvidia.com</email>
							<affiliation key="aff2">
								<orgName type="institution">NVIDIA</orgName>
								<address>
									<settlement>Santa Clara</settlement>
									<region>California</region>
									<country key="US">USA</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Roberto</forename><surname>Navigli</surname></persName>
							<email>navigli@diag.uniroma1.it</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Dipartimento di Ingegneria Informatica</orgName>
								<orgName type="department" key="dep2">Automatica e Gestionale</orgName>
								<orgName type="laboratory">Sapienza NLP Group</orgName>
								<orgName type="institution">Sapienza University of Rome</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<affiliation key="aff3">
								<orgName type="department">Tenth Italian Conference on Computational Linguistics</orgName>
								<address>
									<addrLine>Dec 04 -06</addrLine>
									<postCode>2024</postCode>
									<settlement>Pisa</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">Minerva LLMs: The First Family of Large Language Models Trained from Scratch on Italian Data</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">80F99ADF63E2489B537AA61E179BCA87</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T17:33+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Large Language Models</term>
					<term>Language Modeling</term>
					<term>Italian Language</term>
					<term>LLM Pretraining</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>The growing interest in Large Language Models (LLMs) has accelerated research efforts to adapt these models for various languages. Despite this, pretraining LLMs from scratch for non-English languages remains underexplored. This is the case for Italian, where no truly open-source research has investigated the pretraining process. To address this gap, we introduce Minerva (https://nlp.uniroma1.it/minerva), the first family of LLMs trained entirely from scratch on native Italian texts. Our work is the first investigation into the challenges and opportunities of pretraining LLMs specifically for the Italian language, offering insights into vocabulary design, data composition, and model development. With Minerva, we demonstrate that building an LLM tailored to a specific language yields numerous practical benefits over adapting existing multilingual models, including greater control over the model's vocabulary and the composition of its training data. We provide an overview of the design choices, pretraining methods, and evaluation metrics used to develop Minerva, which shows promising performance on Italian benchmarks and downstream tasks. Moreover, we share the lessons learned throughout Minerva's development to support the academic and industrial communities in advancing non-English LLM research. We believe that Minerva serves as an important step towards closing the gap in high-quality, open-source LLMs for non-English languages.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>Large Language Models (LLMs) have revolutionized the way Natural Language Processing (NLP) tasks are approached, achieving remarkable results in existing areas and opening the door to entirely new research directions and applications. As a result, the energy and resources dedicated to the study and creation of LLMs are growing exponentially. However, most LLMs -both closed- and open-source -are predominantly designed for English, posing significant challenges and limitations for their use in non-English settings. In practice, generating Italian text using multilingual or language-adapted English models, e.g., from Mistral <ref type="bibr" target="#b0">[1]</ref> or Llama <ref type="bibr" target="#b1">[2,</ref><ref type="bibr" target="#b2">3]</ref>, is computationally more expensive and often less effective compared to using a model specifically designed for the Italian language. This inefficiency stems from the vocabulary of an English or multilingual LLM -i.e., the lexical units, or tokens, that the model can use to compose text -when it is not optimized for the Italian language, resulting in Italian words being split into an excessive number of tokens. Consequently, this creates longer sequences of tokens, slower generation times, and higher computational costs, especially since many popular attention mechanisms have a quadratic complexity with respect to sequence length.</p><p>Efforts to create language-specific LLMs are increasing, and fall primarily into two main categories: i) adapting existing English-centric LLMs to other languages, and ii) training LLMs from scratch. The advantages of adapting existing English-centric LLMs to other languages are enticing: starting with a proven model can reduce the computational requirements, and adaptation can be achieved with relatively modest amounts of data. 
There are several language adaptation techniques, which range from fine-tuning the model on data for the target language <ref type="bibr" target="#b3">[4,</ref><ref type="bibr" target="#b4">5]</ref> to modifying the model's architecture <ref type="bibr" target="#b5">[6,</ref><ref type="bibr" target="#b6">7,</ref><ref type="bibr" target="#b7">8]</ref>, making these techniques flexible for different budgets and objectives. However, these techniques may not fully capture language-specific nuances and can degrade the performance in the original language, indeed an undesirable effect. Alternatively, training LLMs from scratch provides the freedom to make design choices tailored to the linguistic features of the target language-including morphology, lexicon, syntax, and semantics-which are often overlooked in English-centric models <ref type="bibr" target="#b8">[9]</ref>. It also allows for incorporating culturally relevant content, reducing biases that might be present in models primarily trained on English data, thus leading to more inclusive and accurate representations of language use. Unfortunately, while there are several efforts on adapting English-centric LLMs to the Italian language, e.g., Llamantino-2 <ref type="bibr" target="#b3">[4]</ref>, Llamantino-3 <ref type="bibr" target="#b4">[5]</ref>, DanteLLM <ref type="bibr" target="#b9">[10]</ref>, and Camoscio <ref type="bibr" target="#b10">[11]</ref>, inter alia, there is no truly open-source endeavor exploring what can be achieved by training an LLM from scratch on Italian data.</p><p>With this work, we follow the latter path and introduce Minerva, the first family of LLMs designed specifically for the Italian language and pretrained on Italian text. 
<ref type="foot" target="#foot_0">1</ref>We present the design choices for our models, our data processing, and the evaluation results regarding our Minerva LLMs, showing that our models -with 350M, 1B, 3B, and 7B parameters -outperform comparable multilingual models and even rival larger models adapted for Italian. We conclude with a discussion on the benefits and challenges of pretraining LLMs from scratch for the Italian language, sharing our experience and findings to provide valuable insights for the academic and industrial communities interested in training non-English LLMs from scratch. Lastly, we describe the technical details of Minerva-7B, our latest model with 7.4 billion parameters, for which we share our initial results.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Building a Pretraining Dataset for Italian LLMs</head><p>The field of LLMs is growing at an astonishing pace, with new models, datasets, benchmarks, and techniques presented every week. However, over the past few months, academic and industrial researchers have increasingly recognized the fundamental role of the data used to pretrain LLMs. Unsurprisingly, the majority of the leading companies are not releasing their training data as they seek to maintain an advantage over the competition, with very few exceptions (e.g. OLMo by AllenAI <ref type="bibr" target="#b11">[12]</ref> and OpenELM by Apple <ref type="bibr" target="#b12">[13]</ref>). In this section, we describe the different sources of data used in the training of the Minerva models, and Table <ref type="table" target="#tab_0">1</ref> provides an overview of these (cf. Appendix A for more details). Most importantly, the training datasets we used are entirely available online, making our process transparent and allowing researchers to better study the connection between pretraining data and model behavior.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Data Sources</head><p>The training data for our Minerva models consists of three main categories: Italian, English, and code data. Datasets used to train Minerva with their languages (second column) and number of tokens (third to sixth columns).</p><p>We only use the code data to train our largest model, i.e., Minerva-7B.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.1.">Italian Data</head><p>Web data. The majority of the text used to train LLMs is sourced from Web-scraped data, typically from CommonCrawl (CC). Therefore, a significant portion of Italian text included in our training datasets is also of this nature, inherently exposing our models to potential biases and toxic content commonly found on the Web. Because preprocessing techniques, such as language identification, perplexity filtering, deduplication, and content classification are computationally expensive, the most sensible choice is thus to rely on preprocessed collections, such as CulturaX <ref type="bibr" target="#b13">[14]</ref> and RedPajama v2 <ref type="bibr" target="#b14">[15]</ref>. These collections already include Italian data, and have undergone various levels of filtering and deduplication, as discussed in Section 2.2.</p><p>Curated data. While Penedo et al. <ref type="bibr" target="#b15">[16]</ref> suggest that high-quality Web data is sufficient on its own to train LLMs, curated data sources are often used to further improve the model performance and introduce a broader diversity of data types, such as encyclopedic and academic text <ref type="bibr" target="#b16">[17]</ref>, as well as scientific and math-related text. Therefore, we include curated texts from several sources, including Wikipedia (encyclopedic/world knowledge data), EurLex and Gazzetta Ufficiale (law, economics, and politics), and the Gutenberg Project (novels, poetry, etc.).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.2.">English Data</head><p>Web data. Mirroring our approach with the Italian data, we use preprocessed collections of English data from the Web. Given that English is the most popular language on the Internet and has been the primary focus of LLM research, there are numerous options that already provide a large amount of tokens from filtered, deduplicated, and cleaned sources. For our Minerva-350M, 1B, and 3B models, we collect data from the English partition of CulturaX, capping the number of tokens to the same amount as the Italian ones, as shown in Table <ref type="table" target="#tab_0">1</ref>. Instead, to train Minerva-7B, we use a portion of FineWeb <ref type="bibr" target="#b17">[18]</ref>, which includes filtered and deduplicated CC dumps with various timestamps. Specifically, we use the CC dumps from 2023-14 to 2024-18 to match the total number of tokens in the Italian Web partition of our training data.</p><p>Curated sources. We include the 5.3B tokens from the English Wikipedia and 7B tokens from the copyright-free books in Project Gutenberg. Additionally, we include data from arXiv and StackExchange, which are included in the RedPajama dataset.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.3.">Code Data</head><p>Previous work has highlighted the importance of including source code in the pretraining corpus of an LLM, in order to improve not only its code understanding and generation, but also its general reasoning capabilities <ref type="bibr" target="#b18">[19]</ref> even for tasks that do not directly involve or require programming. Therefore, for our largest model -Minerva-7B -we also include a portion of code data. More specifically, we extract 200B tokens from The Stack V2 <ref type="bibr" target="#b19">[20]</ref>, selecting the data from their deduplicated partition, which includes 17 of the most popular programming languages on GitHub.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Data Preprocessing</head><p>As mentioned above, our preprocessing effort remains minimal, as we rely on the preprocessing pipelines used in CulturaX, RedPajama, and FineWeb. To evaluate the content and quality of our training data, we employ the methodology described in Elazar et al. <ref type="bibr" target="#b20">[21]</ref> to analyze the URL domain distribution within the Italian partition of CulturaX and RedPajama, as these partitions had never been utilized in training an LLM prior to Minerva. We provide an overview of our analysis together with a few insights in Appendix B.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.">Data Filtering and Deduplication</head><p>Previous work on English-centric LLMs <ref type="bibr" target="#b21">[22]</ref> has already emphasized the importance of training LLMs on "clean" data. Two of the most important parts of data cleaning are filtering, i.e., removing content that does not satisfy a set of criteria, and deduplication, i.e., removing portions of text that appear too often so as to minimize memorization.</p><p>As mentioned above, for the corpus used to train the Minerva models, we rely mainly on collections of data that has already been filtered and deduplicated. However, there are some minor considerations that depend on each collection of data. More specifically, we use CulturaX as-is, relying on their filtering and deduplication pipeline. Unfortunately, RedPajama v2 is not filtered and deduplicated; however, its data is tagged with meta-information that can be used to apply filtering and deduplication. Such metadata includes, for example, the perplexity score of each text computed via a language model trained on Wikipedia, which is used to partition RedPajama v2 into three partitions: head, middle, tail. For our training corpus, we only include a document if it is classified as head or middle according to its perplexity score. Moreover, we use the precomputed metadata to remove exact duplicates and apply fuzzy deduplication. The latter is performed by using the hash provided for each document with Locality Sensitive Hashing and Jaccard similarity 0.7 to decide whether two documents are fuzzy duplicates. Note that we only apply fuzzy deduplication within each CC dump, rather than across all the dumps. 
This decision is motivated by two observations: first, applying fuzzy deduplication across all CC dumps is computationally expensive; second, previous work <ref type="bibr" target="#b17">[18]</ref> has shown that per-CC deduplication is not only sufficient, but is also beneficial, when training English LLMs.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Minerva LLMs</head><p>In this section, we provide an overview of the Minerva LLMs: we describe their tokenizers, the design choices behind the model architecture, and how we trained the resulting LLMs.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Vocabulary and Tokenizers</head><p>The vocabulary of an LLM is mainly impacted by its size, i.e., the number of tokens in the vocabulary itself, and how the tokenizer is trained, i.e., which tokens make up the vocabulary. These two factors impact the fertility of the resulting tokenizer, which measures the average number of tokens (subwords) into which a word is split. Tokenizers with lower fertility are preferable, as the input and output sequences they produce are shorter, resulting in an efficiency gain, especially as most attention mechanisms are quadratic with respect to the sequence length. Unsurprisingly, the vocabulary allocation of an English-centric LLM minimizes the fertility of English text, and results in high fertility values for Italian text, as shown in Table <ref type="table" target="#tab_2">2</ref>.  Given the importance for our Minerva LLMs of having a low fertility on Italian text, we intentionally train the Minerva tokenizer on a balanced mix of English and Italian data (and code data for the 7B model). Our analysis shows that this strategy leads to a much improved fertility on Italian data, while at the same time maintaining similar fertility on English data. More specifically, for Minerva-350M/1B/3B, we opted for a vocabulary size similar to that of Mistral-7B (around 32k tokens): in this case, the fertility of the Minerva tokenizer is ~20% better than the Mistral tokenizer on the Italian Wikipedia and only ~1% worse on the English Wikipedia. Following recent trends in LLMs, for Minerva-7B, we increased the vocabulary size to around 50k tokens, which resulted in a further fertility improvement of ~6% and ~5% on the Italian and English Wikipedias, respectively, notwithstanding the addition of code data to the training data. We provide more details on the tokenizer in Appendix C.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Model Architecture</head><p>While the field of LLMs is moving rapidly, one of the best models when our efforts started was Mistral. Therefore, our Minerva LLMs are based on Mistral's model architecture. The Minerva LLMs are, therefore, a family of decoder-only transformer models, with a few standout features, such as grouped-query attention (GQA) <ref type="bibr" target="#b22">[23]</ref>, which boosts inference speed and reduces memory requirements for increased throughput, and sliding window attention (SWA) <ref type="bibr" target="#b23">[24,</ref><ref type="bibr" target="#b24">25]</ref>, which manages longer sequences more efficiently at reduced computational costs. Specifically, the GQA is configured to share one key-value pair every four queries, while the SWA configuration handles up to 2,048 tokens with a maximum context length of 16,384 tokens. We build four models with different sizes by scaling the number of attention heads, hidden size, intermediate size, and hidden layers, while maintaining a ratio of ~3.5 between the hidden size and intermediate size, as in the original Mistral model. However, following the more recent model releases by Mistral, Minerva-7B does not use SWA. Instead, it implements full attention across its entire context length, which can extend up to 4096 tokens, i.e., double the number of tokens for the SWA used in Minerva-350M/1B/3B. The parameters for each model size are detailed in Table <ref type="table" target="#tab_4">3</ref>, for which we provide a more in-depth description in Appendix D.</p><p>Building Minerva on top of Mistral's model architecture also brings other benefits, such as broad compatibility with the ecosystem of libraries, frameworks, and tools that has emerged over recent months, including llama.cpp <ref type="bibr" target="#b25">[26]</ref>, FlashAttention <ref type="bibr" target="#b26">[27]</ref>, and vLLM <ref type="bibr" target="#b27">[28]</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3.">Model Training</head><p>We train all the Minerva LLMs using MosaicML's LLM Foundry. <ref type="foot" target="#foot_1">2</ref> The training process is conducted on the Leonardo Supercomputer<ref type="foot" target="#foot_2">3</ref> hosted and maintained by CINECA. Each node in Leonardo is equipped with 4 × custom NVIDIA A100 SXM4 with 64GB of VRAM.</p><p>All our models are trained using the AdamW optimizer <ref type="bibr" target="#b28">[29]</ref> with 𝛽 1 = 0.9, 𝛽 2 = 0.95, 𝑒𝑝𝑠 = 10 −8 (with the only exception being Minerva-7B, which is trained using 𝑒𝑝𝑠 = 10 −5 ) on a standard causal language modeling training objective. To smooth the training process, we follow standard practice in the literature and employ a warmup-then-cooldown learning rate scheduling. More specifically, we first increase the learning rate linearly during the initial training phase (2% of the total number of training steps for Minerva-350M/1B/3B and 0.3% for Minerva-7B) until the peak learning rate is reached (2×10 −4 for Minerva-350M/1B/3B, 3×10 −4 for Minerva-7B), and then decrease the learning rate with a cosine scheduling until the end of the training process. The hyperparameters used for each model are shown in Table <ref type="table">7</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Evaluation</head><p>We measure the 0-shot performance of our Minerva LLMs on ITA-Bench <ref type="bibr" target="#b29">[30]</ref>, a suite of benchmarks that have been created either by translating existing benchmarks from other languages, or by adapting existing Italian benchmarks so that they can be used for LLM evaluation. ITA-Bench includes a set of 10 benchmarks commonly used to evaluate LLMs, namely, ARC Challenge (ARC-C), ARC Easy (ARC-E) <ref type="bibr" target="#b30">[31]</ref>, BoolQ <ref type="bibr" target="#b31">[32]</ref>, GSM8K <ref type="bibr" target="#b32">[33]</ref>, HellaSwag (HS) <ref type="bibr" target="#b33">[34]</ref>, MMLU <ref type="bibr" target="#b34">[35]</ref>, PIQA <ref type="bibr" target="#b35">[36]</ref>, SciQ <ref type="bibr" target="#b36">[37]</ref>, TruthfulQA <ref type="bibr" target="#b37">[38]</ref>, and Winogrande (WG) <ref type="bibr" target="#b38">[39]</ref>. Overall, these benchmarks offer a comprehensive view of the capabilities of an LLM on a wide variety of aspects, including scientific knowledge, world knowledge (e.g., geography, politics, economics), commonsense knowledge, physical  </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 4</head><p>Zero-shot evaluation results of the Minerva models on a set of standard benchmarks translated from English to Italian.</p><p>interactions, coreference, and math reasoning, among others. Employing automatically-translated benchmarks is far from ideal, but it allows us to better compare the scores obtained in Italian with those obtained in English, while we wait for the Italian research community to develop Italian-specific benchmarks <ref type="bibr" target="#b39">[40]</ref>.</p><p>As shown in Table <ref type="table">4</ref>, the average performance of the Minerva models increases steadily with the model size. For our 3B model, we also provide a comparison with two models of the same size: XGLM <ref type="bibr" target="#b40">[41]</ref>, a multilingual LLM by META, and OpenELM <ref type="bibr" target="#b41">[42]</ref>, a very recent English-only model developed by Apple. Our evaluation shows that Minerva-3B outperforms XGLM and OpenELM by a significant margin, i.e., +4.4% and +3.7% on average.</p><p>Finally, Minerva-7B achieves the highest performance among the Minerva LLMs family, as expected. Notably, Minerva-7B achieves a higher average score than Llamantino-2. This is an interesting comparison because the pretraining data for Llama-2, i.e., the pretrained LLM used to build Llamantino-2, is not available and has never been disclosed, making the model open-weights but not entirely open-source. 4 When compared to closed-sourced LLMs such as Mistral-7B-v0.1 or Llama-3.1-8B, Minerva still lags behind in some tasks, such as BoolQ or GSM8K, which may require better reasoning capabilities and/or more pretraining data. 
As we can observe from Figure <ref type="figure" target="#fig_0">1</ref>, which tracks the progress of Minerva-7B 4</p><p>We stress that, for Llamantino-2, only the data that has been used for the language adaptation process is available, whereas the pretraining data is not.</p><p>on ITA-Bench every 10,000 training steps, the model is still slowly improving towards the end of the pretraining phase, suggesting that a larger training corpus or multiple epochs may be beneficial in future developments.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Downstream tasks</head><p>In this section, we show the results of the Minerva models when adapted to two downstream applications. This analysis is particularly relevant for Minerva-350M and Minerva-1B, which can be utilized for specific tasks rather than as general-purpose models, offering lower computational costs. The tasks in this analysis include: i) Italian Abstractive News Summarization, and ii) Machine Translation, in both directions (IT-EN and EN-IT).</p><p>News Summarization. Following Sarti and Nissim <ref type="bibr" target="#b42">[43]</ref>, we fine-tune Minerva models (up to 3B) on a concatenation of two Italian news summarization datasets: Fanpage.it and Il Post newspapers <ref type="bibr" target="#b43">[44]</ref>. A detailed overview of the hyperparameters used to train our models is provided in Appendix E. We find that Minerva-3B obtains the best results (0.30 vs 0.29 of the second best in terms of Rouge-L); however, it is not as parameter-efficient as IT5-Large, probably because encoder-decoder models are more suitable for fine-tuning than decoder-only models <ref type="bibr" target="#b44">[45]</ref>. In Table <ref type="table">8</ref>, we report the full results of Minerva fine-tuned on the aforementioned datasets and compared to baselines in Sarti and Nissim <ref type="bibr" target="#b42">[43]</ref>.</p><p>Machine Translation. We also evaluate our Minerva LLMs in few-shot <ref type="bibr" target="#b45">[46]</ref> machine translation on two benchmarks, FLORES <ref type="bibr" target="#b46">[47]</ref> and OPUS-100 <ref type="bibr" target="#b47">[48]</ref>. We explore how LLMs perform this task relying only on in-context-learning few-shot examples, reporting our results with 5-shot prompting. 
We rely on the vLLM library <ref type="bibr" target="#b27">[28]</ref> and change the default parameters with temperature=0 and max_tokens=512.</p><p>We highlight that Minerva-3B reaches competitive results in MT in both EN-IT (84.8 on Flores and 76.7 on Opus in terms of COMET score) and IT-EN (85.7 and 78.0). Compared with other models of similar size, Minerva-3B shows strong results when the target language is Italian (+1.7 and +2.7 compared to Gemma-2B and Qwen-1.5B on Opus). Minerva-7B further showcases this by achieving the highest performance among models tested when translating from English into Italian. The full results are reported in Table <ref type="table" target="#tab_6">5</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">Conclusion and Future Work</head><p>In this paper, we demonstrated the feasibility and benefits of pretraining Italian language models from scratch, which not only improves the computational efficiency and performance of an LLM for a target language but also reduces linguistic biases inherited from English training corpora <ref type="bibr" target="#b48">[49]</ref>. The Minerva models (https://nlp.uniroma1.it/minerva) showcase promising results on a variety of Italian benchmarks and downstream tasks, including news summarization and machine translation. Most importantly, we describe, for the first time, the process of creating an Italian pretraining corpus with more than 1T tokens, and we share findings and insights into the pretraining process of Italian LLMs with the academic and industrial communities, paving the way for future research in training non-English language models. We hope that our contributions will represent a stepping stone for future work on language-specific and multilingual large-scale language modeling.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 6</head><p>Detailed breakdown of each dataset.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>B. Dataset Insights</head><p>We leveraged the WIMBD<ref type="foot" target="#foot_3">5</ref> library to compute word counts per URL domain on CulturaX. We decided not to do this for RedPajama v2 or FineWeb as their original data already provides token count and other insights into the dataset distribution. Figures <ref type="figure">2 and 3</ref> show the aggregation of word counts per domain for Italian and English, respectively.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>C. Tokenizer</head><p>We trained two tokenizers for Minerva. The first one is shared by the three smaller sizes, 350M, 1B and 3B. It is trained on a mix of 4GB of Italian text data and 4GB of English text data, both from CulturaX. Our objective is to have a balanced vocabulary across the two languages, mirroring the training data. We use the SentencePiece library <ref type="foot" target="#foot_4">6</ref> to train a BPE tokenizer and we apply byte fallback. We set a vocabulary size of 32,768 as a multiple of 8, which is recommended by some GPU architectures.</p><p>For the 7B tokenizer, we increase the vocabulary size to account for the inclusion of code data, up to 51,200. We also train a BPE tokenizer <ref type="foot" target="#foot_5">7</ref> with 4GB of English text, 4GB of Italian and 1GB of code. The text data is sampled from the training mix of datasets for the 7B, as reported in Table <ref type="table" target="#tab_0">1</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>D. Model</head><p>The Minerva LLM family consists of four models, each sharing the same underlying architecture, i.e., that of Mistral-7B. The models are differentiated by their size, ranging from 350 million parameters of Minerva-350M to 7 billion parameters of the largest model, Minerva-7B. The Minerva family also includes Minerva-1B and Minerva-3B, with 1 billion and 3 billion parameters, respectively. More specifically, the Minerva-7B model is based directly on the Mistral-7B architecture, with the sole modifications being the vocabulary size, which we increase to 51,200 tokens, and the context length, which is set to 4,096 tokens without activating the sliding window attention feature. Hence, Minerva-7B is structured as a decoder-only transformer model, comprising 32 layers. Each layer includes 32 attention heads, where each key-value pair is shared among four queries. Additionally, the model features feed-forward layers with a hidden size of 4096 and an intermediate size of 14336, which is 3.5 times the hidden size. Minerva-3B is a scaled down version of Minerva-7B, and it shares similar features with Mistral-7B, including a maximum context length of 16,384 tokens, sliding window attention spanning 2,048 tokens, and a vocabulary size of 32,768 tokens. To achieve approximately 3 billion parameters, we have reduced the hidden size to 2560 and the intermediate size to 8960. Minerva-1B and Minerva-350M differ from their larger counterpart in several key respects. Both models have 16 attention heads, in contrast to the higher count in the larger model. Additionally, the hidden and intermediate sizes of the feed-forward layers are reduced further: Minerva-1B features a hidden size of 2048 and an intermediate size of 7168, while Minerva-350M has a hidden size of 1152 and an intermediate size of 4032. The complete list of parameters is reported in Table <ref type="table" target="#tab_4">3</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>E. News Summarization</head><p>Additional results. Table <ref type="table">8</ref> reports the full results of our evaluation on news summarization.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1:Tracking the progress of Minerva-7B during its pretraining process. Here, we report the average accuracy on ITA-Bench every 10,000 steps, i.e., every 40B tokens approximately.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1</head><label>1</label><figDesc></figDesc><table><row><cell>Dataset</cell><cell></cell><cell cols="4">Minerva -Model Size</cell></row><row><cell>Name</cell><cell>Lang.</cell><cell>350M</cell><cell>1B</cell><cell>3B</cell><cell>7B</cell></row><row><cell>RedPajama-V2</cell><cell>Italian</cell><cell>-</cell><cell>-</cell><cell>-</cell><cell>894B</cell></row><row><cell>CulturaX</cell><cell>Italian</cell><cell>35B</cell><cell>100B</cell><cell>330B</cell><cell>237B</cell></row><row><cell>Wikipedia</cell><cell>Italian</cell><cell>-</cell><cell>-</cell><cell>-</cell><cell>1.3B</cell></row><row><cell>Gutenberg</cell><cell>Italian</cell><cell>-</cell><cell>-</cell><cell>-</cell><cell>0.15B</cell></row><row><cell>Wikisource</cell><cell>Italian</cell><cell>-</cell><cell>-</cell><cell>-</cell><cell>0.12B</cell></row><row><cell>EurLex</cell><cell>Italian</cell><cell>-</cell><cell>-</cell><cell>-</cell><cell>1.6B</cell></row><row><cell>Gazzetta Ufficiale</cell><cell>Italian</cell><cell>-</cell><cell>-</cell><cell>-</cell><cell>1.7B</cell></row><row><cell>FineWeb</cell><cell>English</cell><cell>-</cell><cell>-</cell><cell>-</cell><cell>1,076B</cell></row><row><cell>CulturaX</cell><cell>English</cell><cell>35B</cell><cell>100B</cell><cell>330B</cell><cell>-</cell></row><row><cell>Wikipedia</cell><cell>English</cell><cell>-</cell><cell>-</cell><cell>-</cell><cell>5.3B</cell></row><row><cell>ArXiv</cell><cell>English</cell><cell>-</cell><cell>-</cell><cell>-</cell><cell>33B</cell></row><row><cell>Gutenberg</cell><cell>English</cell><cell>-</cell><cell>-</cell><cell>-</cell><cell>7B</cell></row><row><cell>StackExchange</cell><cell>English</cell><cell>-</cell><cell>-</cell><cell>-</cell><cell>22B</cell></row><row><cell>The Stack V2</cell><cell>Code</cell><cell>-</cell><cell>-</cell><cell>-</cell><cell>201B</cell></row><row><cell>Total # of tokens</cell><cell></cell><cell>70B</cell><cell cols="2">200B 
660B</cell><cell>2.48T</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head>Table 2</head><label>2</label><figDesc>Fertility rates (lower is better) for Minerva tokenizers compared to other LLMs. The fertility rates are computed on a randomly sampled collection of texts from CulturaX and Wikipedia in both Italian (Ita) and English (Eng).</figDesc><table /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_3"><head>Model Params Layers Hidden Size Inter. Size Att. Heads KV Heads SW Length Ctx. Length</head><label></label><figDesc></figDesc><table><row><cell>Minerva-350M</cell><cell>352M</cell><cell>16</cell><cell>1152</cell><cell>4032</cell><cell>16</cell><cell>4</cell><cell>2048</cell><cell>16,384</cell></row><row><cell>Minerva-1B</cell><cell>1.01B</cell><cell>16</cell><cell>2048</cell><cell>7168</cell><cell>16</cell><cell>4</cell><cell>2048</cell><cell>16,384</cell></row><row><cell>Minerva-3B</cell><cell>2.89B</cell><cell>32</cell><cell>2560</cell><cell>8960</cell><cell>32</cell><cell>8</cell><cell>2048</cell><cell>16,384</cell></row><row><cell>Minerva-7B</cell><cell>7.40B</cell><cell>32</cell><cell>4096</cell><cell>14336</cell><cell>32</cell><cell>8</cell><cell>None</cell><cell>4,096</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_4"><head>Table 3</head><label>3</label><figDesc>Overview of the main hyperparameters for our Minerva models. We include the number of parameters (approximately, 350M, 1B, 3B, and 7B) and the corresponding number of layers, hidden size, intermediate size, attention heads, key-value heads, sliding window length, and maximum context length.</figDesc><table><row><cell>Size Name</cell><cell cols="4">ARC-C ARC-E BoolQ GSM8K</cell><cell>HS</cell><cell cols="6">MMLU PIQA SciQ TQA WG AVG</cell></row><row><cell>0.4B Minerva-350M-base-v1.0</cell><cell>24.6</cell><cell>36.4</cell><cell>60.7</cell><cell>48.2</cell><cell>32.6</cell><cell>25.7</cell><cell>59.5</cell><cell>63.7</cell><cell>46.5</cell><cell>58.4</cell><cell>45.6</cell></row><row><cell>1B Minerva-1B-base-v1.0</cell><cell>26.6</cell><cell>42.2</cell><cell>57.1</cell><cell>49.7</cell><cell>39.6</cell><cell>27.0</cell><cell>62.9</cell><cell>73.5</cell><cell>44.6</cell><cell>60.0</cell><cell>48.3</cell></row><row><cell>3B OpenELM-3B</cell><cell>27.0</cell><cell>37.9</cell><cell>60.9</cell><cell>49.7</cell><cell>40.7</cell><cell>28.3</cell><cell>56.7</cell><cell>81.8</cell><cell>47.3</cell><cell>58.4</cell><cell>48.9</cell></row><row><cell>3B XGLM-2.9B</cell><cell>27.5</cell><cell>41.4</cell><cell>59.1</cell><cell>65.7</cell><cell>44.5</cell><cell>27.4</cell><cell>59.9</cell><cell>77.8</cell><cell>43.1</cell><cell>60.2</cell><cell>50.6</cell></row><row><cell>3B Minerva-3B-base-v1.0</cell><cell>31.4</cell><cell>49.1</cell><cell>62.1</cell><cell>55.8</cell><cell>52.9</cell><cell>29.2</cell><cell>66.9</cell><cell>79.9</cell><cell>41.4</cell><cell>62.2</cell><cell>53.1</cell></row><row><cell>7B OLMo-7B-0724-hf</cell><cell>30.7</cell><cell>44.0</cell><cell>72.9</cell><cell>52.5</cell><cell>47.9</cell><cell>30.9</cell><cell>58.7</cell><cell>85.1</cell><cell>44.6</cell><cell>61.2</cell><cell>52.8</cell></row><row><cell>7B 
LLaMAntino-2-7b</cell><cell>33.7</cell><cell>50.8</cell><cell>70.9</cell><cell>52.2</cell><cell>54.9</cell><cell>33.8</cell><cell>64.4</cell><cell>86.1</cell><cell>44.3</cell><cell>64.1</cell><cell>55.5</cell></row><row><cell>7B Minerva-7B-base-v1.0</cell><cell>42.0</cell><cell>68.8</cell><cell>79.5</cell><cell>50.0</cell><cell>62.6</cell><cell>36.2</cell><cell>69.8</cell><cell>87.7</cell><cell>38.5</cell><cell>65.0</cell><cell>60.0</cell></row><row><cell>7B Mistral-7B-v0.1</cell><cell>42.8</cell><cell>61.3</cell><cell>78.2</cell><cell>56.1</cell><cell>60.4</cell><cell>38.0</cell><cell>65.5</cell><cell>90.8</cell><cell>43.5</cell><cell>68.8</cell><cell>60.5</cell></row><row><cell>8B Llama-3.1-8B</cell><cell>44.0</cell><cell>61.1</cell><cell>78.0</cell><cell>57.8</cell><cell>62.9</cell><cell>38.7</cell><cell>67.7</cell><cell>90.3</cell><cell>43.0</cell><cell>69.2</cell><cell>61.3</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_6"><head>Table 5</head><label>5</label><figDesc>COMET scores measure the translation capabilities of our Minerva models and other LLMs on the FLORES and OPUS datasets. This evaluation is conducted in a 5-shot setting, where each model receives five random translation examples from the development set before the test instance.</figDesc><table /></figure>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="1" xml:id="foot_0">https://nlp.uniroma1.it/minerva</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="2" xml:id="foot_1">https://github.com/mosaicml/llm-foundry</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="3" xml:id="foot_2">https://leonardo-supercomputer.cineca.eu/</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="5" xml:id="foot_3">https://github.com/allenai/wimbd</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="6" xml:id="foot_4">https://github.com/google/sentencepiece</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="7" xml:id="foot_5">https://huggingface.co/docs/tokenizers/en/api/trainers</note>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>Edoardo Barba, Simone Conia and Pere-Lluís Huguet Cabot are fully funded by the PNRR MUR project PE0000013-FAIR. Roberto Navigli acknowledges the support of the CREATIVE PRIN project. The authors acknowledge the CINECA award IsB28_medit under the ISCRA initiative for the availability of high-performance computing resources and support.</p></div>
			</div>


			<div type="availability">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Code Code</head><p>https://huggingface.co/datasets/bigcode/the-stack-v2-train-smol-ids</p></div>
			</div>

			<div type="annex">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>A. Data sources</head><p>Table <ref type="table">6</ref> shows the source of each dataset used to train Minerva in its different sizes. The Tokens column shows the total number of tokens we used from each dataset. Where Table <ref type="table">1</ref> shows more tokens used for training, it means they were resampled from the total in order to reach that number. All these datasets are openly licensed. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 7</head><p>Training configuration for various Minerva models.</p><p>Additional details on the experimental setup. To finetune our Minerva models we relied on the SFTTrainer class. 8 The hyperparameters we used are reported in Table <ref type="table">9</ref>. We sought to be in-line with the decisions taken in <ref type="bibr" target="#b42">[43]</ref>. We also tried out different combinations, but we noticed that the best evaluation scores are given by the  </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>F. Few-shot Machine Translation</head><p>Here, we provide more details on our experimental setup for the Machine Translation task. In our experiments, we test the capability of a base model (i.e., with no instruction fine-tuning or task-specific fine-tuning) to translate a sentence from English to Italian and vice versa. Previously, LLMs have been shown to perform well in machine translation and they now rival task-specific MT systems on a number of benchmarks <ref type="bibr" target="#b49">[50]</ref> and tasks <ref type="bibr" target="#b50">[51]</ref>. In our case, we prompt the language models by providing a set of 5 randomly sampled English-to-Italian translations (and vice-versa for the Italian-to-English translation). Finally, we measure the translation performance of the models using COMET, a learned metric to assess the quality between an automatic translation and a gold reference, as COMET has shown better correlation with human judgement than other metrics, such as BLEU. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Model</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 9</head><p>Hyper-parameters used to fine-tune our models.</p></div>			</div>
			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title/>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">Q</forename><surname>Jiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Sablayrolles</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mensch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Bamford</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">S</forename><surname>Chaplot</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>De Las Casas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Bressand</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Lengyel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Lample</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Saulnier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">R</forename><surname>Lavaud</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M.-A</forename><surname>Lachaux</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Stock</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">L</forename><surname>Scao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lavril</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lacroix</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><forename type="middle">E</forename><surname>Sayed</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2310.06825.arXiv:2310.06825" />
	</analytic>
	<monogr>
		<title level="j">Mistral</title>
		<imprint>
			<biblScope unit="volume">7</biblScope>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<monogr>
		<author>
			<persName><forename type="first">H</forename><surname>Touvron</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lavril</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Izacard</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Martinet</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M.-A</forename><surname>Lachaux</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lacroix</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Rozière</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Goyal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Hambro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Azhar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Rodriguez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Joulin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Grave</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Lample</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2302.13971.arXiv:2302.13971" />
		<title level="m">Llama: Open and efficient foundation language models</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<monogr>
		<author>
			<persName><forename type="first">H</forename><surname>Touvron</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Martin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Stone</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Albert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Almahairi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Babaei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Bashlykov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Batra</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Bhargava</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Bhosale</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Bikel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Blecher</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">C</forename><surname>Ferrer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Cucurull</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Esiobu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Fernandes</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Fuller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Gao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Goswami</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Goyal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Hartshorn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Hosseini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Hou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Inan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Kardas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kerkez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Khabsa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Kloumann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Korenev</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">S</forename><surname>Koura</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M.-A</forename><surname>Lachaux</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lavril</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Liskovich</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Mao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Martinet</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Mihaylov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Mishra</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Molybog</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Nie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Poulton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Reizenstein</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Rungta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Saladi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Schelten</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Silva</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><forename type="middle">M</forename><surname>Smith</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Subramanian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><forename type="middle">E</forename><surname>Tan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Tang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Taylor</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Williams</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">X</forename><surname>Kuan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Yan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Zarov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Fan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Kambadur</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Narang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Rodriguez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Stojnic</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Edunov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Scialom</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2307.09288.arXiv:2307.09288" />
		<title level="m">Llama 2: Open foundation and fine-tuned chat models</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<monogr>
		<title level="m" type="main">Llamantino: Llama 2 models for effective text generation in italian language</title>
		<author>
			<persName><forename type="first">P</forename><surname>Basile</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Musacchio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Polignano</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Siciliani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Fiameni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Semeraro</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2312.09993.arXiv:2312.09993" />
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<monogr>
		<author>
			<persName><forename type="first">M</forename><surname>Polignano</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Basile</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Semeraro</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2405.07101</idno>
		<title level="m">Advanced natural-based interaction for the italian language: Llamantino-3-anita</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<monogr>
		<title level="m" type="main">Efficient language model training through cross-lingual and progressive transfer learning</title>
		<author>
			<persName><forename type="first">M</forename><surname>Ostendorff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Rehm</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2301.09626</idno>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">FOCUS: Effective embedding initialization for monolingual specialization of multilingual models</title>
		<author>
			<persName><forename type="first">K</forename><surname>Dobler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>De Melo</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2023.emnlp-main.829</idno>
		<ptr target="https://aclanthology.org/2023.emnlp-main.829.doi:10.18653/v1/2023.emnlp-main.829" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">H</forename><surname>Bouamor</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><surname>Pino</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">K</forename><surname>Bali</surname></persName>
		</editor>
		<meeting>the 2023 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics<address><addrLine>Singapore</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="13440" to="13454" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<monogr>
		<author>
			<persName><forename type="first">Z</forename><surname>Csaki</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Pawakapan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Du</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Zhao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">U</forename><surname>Thakker</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2404.05829</idno>
		<title level="m">Sambalingo: Teaching large language models new languages</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b8">
	<monogr>
		<author>
			<persName><forename type="first">M</forename><surname>Faysse</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Fernandes</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Guerreiro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Loison</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Alves</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Corro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Boizard</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Alves</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Rei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Martins</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2402.00786</idno>
		<title level="m">Croissantllm: A truly bilingual french-english language model</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">DanteLLM: Let&apos;s push Italian LLM research forward!</title>
		<author>
			<persName><forename type="first">A</forename><surname>Bacciu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Campagnano</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Trappolini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Silvestri</surname></persName>
		</author>
		<ptr target="https://aclanthology.org/2024.lrec-main.388" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
				<editor>
			<persName><forename type="first">N</forename><surname>Calzolari</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">M.-Y</forename><surname>Kan</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">V</forename><surname>Hoste</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">A</forename><surname>Lenci</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">S</forename><surname>Sakti</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">N</forename><surname>Xue</surname></persName>
		</editor>
		<meeting>the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)<address><addrLine>Torino, Italia</addrLine></address></meeting>
		<imprint>
			<publisher>ELRA and ICCL</publisher>
			<date type="published" when="2024">2024</date>
			<biblScope unit="page" from="4343" to="4355" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<monogr>
		<author>
			<persName><forename type="first">A</forename><surname>Santilli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Rodolà</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2307.16456</idno>
		<title level="m">Camoscio: an italian instruction-tuned llama</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<monogr>
		<author>
			<persName><forename type="first">D</forename><surname>Groeneveld</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Beltagy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Walsh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Bhagia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Kinney</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Tafjord</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">H</forename><surname>Jha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Ivison</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Magnusson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Arora</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Atkinson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Authur</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">R</forename><surname>Chandu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Cohan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Dumas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Elazar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Gu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Hessel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Khot</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Merrill</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Morrison</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Muennighoff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Naik</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Nam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">E</forename><surname>Peters</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Pyatkin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ravichander</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Schwenk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Shah</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Smith</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Strubell</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Subramani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Wortsman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Dasigi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Lambert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Richardson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Zettlemoyer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Dodge</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Lo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Soldaini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">A</forename><surname>Smith</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Hajishirzi</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2402.00838.arXiv:2402.00838" />
		<title level="m">Olmo: Accelerating the science of language models</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<monogr>
		<author>
			<persName><forename type="first">S</forename><surname>Mehta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">H</forename><surname>Sekhavat</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Cao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Horton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Jin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Sun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Mirzadeh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Najibi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Belenko</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Zatloukal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Rastegari</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2404.14619.arXiv:2404.14619" />
		<title level="m">Openelm: An efficient language model family with open training and inference framework</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<monogr>
		<author>
			<persName><forename type="first">T</forename><surname>Nguyen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">V</forename><surname>Nguyen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><forename type="middle">D</forename><surname>Lai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Man</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">T</forename><surname>Ngo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Dernoncourt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">A</forename><surname>Rossi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">H</forename><surname>Nguyen</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2309.09400</idno>
		<title level="m">Culturax: A cleaned, enormous, and multilingual dataset for large language models in 167 languages</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<monogr>
		<author>
			<persName><forename type="first">T</forename><surname>Computer</surname></persName>
		</author>
		<ptr target="https://github.com/togethercomputer/RedPajama-Data" />
		<title level="m">Redpajama: an open dataset for training large language models</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<monogr>
		<author>
			<persName><forename type="first">G</forename><surname>Penedo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Malartic</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Hesslow</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Cojocaru</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Cappelli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Alobeidli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Pannier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Almazrouei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Launay</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2306.01116</idno>
		<ptr target="https://arxiv.org/abs/2306.01116.arXiv:2306.01116" />
		<title level="m">The RefinedWeb dataset for Falcon LLM: outperforming curated corpora with web data, and web data only</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b16">
	<monogr>
		<title level="m" type="main">Croissantllm: A truly bilingual french-english language model</title>
		<author>
			<persName><forename type="first">M</forename><surname>Faysse</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Fernandes</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">M</forename><surname>Guerreiro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Loison</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">M</forename><surname>Alves</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Corro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Boizard</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Alves</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Rei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">H</forename><surname>Martins</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">B</forename><surname>Casademunt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Yvon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">F T</forename><surname>Martins</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Viaud</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Hudelot</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Colombo</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2402.00786.arXiv:2402.00786" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<monogr>
		<title level="m" type="main">The fineweb datasets: Decanting the web for the finest text data at scale</title>
		<author>
			<persName><forename type="first">G</forename><surname>Penedo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Kydlíček</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">B</forename></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Lozhkov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Mitchell</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Raffel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">V</forename><surname>Werra</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Wolf</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2406.17557.arXiv:2406.17557" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<author>
			<persName><forename type="first">P</forename><surname>Liang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Bommasani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Tsipras</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Soylu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Yasunaga</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Narayanan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Kumar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Newman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Yuan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Yan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">A</forename><surname>Cosgrove</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">D</forename><surname>Manning</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Re</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Acosta-Navas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">A</forename><surname>Hudson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Zelikman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Durmus</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Ladhak</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Rong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Ren</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Yao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Santhanam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Orr</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Zheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Yuksekgonul</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Suzgun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Guha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">S</forename><surname>Chatterji</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Khattab</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Henderson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Huang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">A</forename><surname>Chi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">M</forename><surname>Xie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Santurkar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Ganguli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Hashimoto</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Icard</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Chaudhary</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Mai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Koreeda</surname></persName>
		</author>
		<ptr target="Ex-pertCertification" />
	</analytic>
	<monogr>
		<title level="m">Holistic evaluation of language models</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<monogr>
		<author>
			<persName><forename type="first">A</forename><surname>Lozhkov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">B</forename><surname>Allal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Cassano</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Lamy-Poirier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Tazi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Tang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Pykhtar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Wei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Tian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Kocetkov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Zucker</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Belkada</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Abulkhanov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Paul</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W.-D</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Risdal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Zhu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">Y</forename><surname>Zhuo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Zheltonozhskii</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">O O</forename><surname>Dade</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Krauß</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Jain</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Su</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>He</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Dey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Abati</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Chai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Muennighoff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Tang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Oblokulov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Akiki</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Marone</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Mou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Mishra</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Gu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Hui</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Dao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Zebaze</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Dehaene</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Patry</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Mcauley</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Scholak</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Paquet</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Robinson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">J</forename><surname>Anderson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Chapados</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Patwary</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Tajbakhsh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Jernite</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Ferrandis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Hughes</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Wolf</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Guha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Werra</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Vries</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2402.19173</idno>
		<title level="m">Starcoder 2 and the stack v2: The next generation</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b20">
	<analytic>
		<title level="a" type="main">What&apos;s in my big data?</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Elazar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Bhagia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><forename type="middle">H</forename><surname>Magnusson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ravichander</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Schwenk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Suhr</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><forename type="middle">P</forename><surname>Walsh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Groeneveld</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Soldaini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Singh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Hajishirzi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">A</forename><surname>Smith</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Dodge</surname></persName>
		</author>
		<ptr target="https://openreview.net/forum?id=RvfPnOkPV4" />
	</analytic>
	<monogr>
		<title level="m">The Twelfth International Conference on Learning Representations</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b21">
	<analytic>
		<title level="a" type="main">web data, and web data only</title>
		<author>
			<persName><forename type="first">G</forename><surname>Penedo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Malartic</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Hesslow</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Cojocaru</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Cappelli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Alobeidli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Pannier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Almazrouei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Launay</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2306.01116" />
	</analytic>
	<monogr>
		<title level="m">The refinedweb dataset for falcon llm: Outperforming curated corpora with</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<monogr>
		<author>
			<persName><forename type="first">J</forename><surname>Ainslie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Lee-Thorp</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>De Jong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zemlyanskiy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Lebrón</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Sanghai</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2305.13245</idno>
		<title level="m">Gqa: Training generalized multi-query transformer models from multihead checkpoints</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b23">
	<monogr>
		<author>
			<persName><forename type="first">R</forename><surname>Child</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Gray</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Radford</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Sutskever</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1904.10509</idno>
		<title level="m">Generating long sequences with sparse transformers</title>
				<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b24">
	<monogr>
		<author>
			<persName><forename type="first">I</forename><surname>Beltagy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">E</forename><surname>Peters</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Cohan</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2004.05150</idno>
		<title level="m">Longformer: The long-document transformer</title>
				<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b25">
	<monogr>
		<title level="m" type="main">llama.cpp: Inference of meta&apos;s llama model (and others) in pure c/c++</title>
		<author>
			<persName><forename type="first">G</forename><surname>Gerganov</surname></persName>
		</author>
		<ptr target="https://github.com/ggerganov/llama.cpp" />
		<imprint/>
	</monogr>
</biblStruct>

<biblStruct xml:id="b26">
	<analytic>
		<title level="a" type="main">FlashAttention-2: Faster attention with better parallelism and work partitioning</title>
		<author>
			<persName><forename type="first">T</forename><surname>Dao</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">International Conference on Learning Representations (ICLR)</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b27">
	<analytic>
		<title level="a" type="main">Efficient memory management for large language model serving with pagedattention</title>
		<author>
			<persName><forename type="first">W</forename><surname>Kwon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Zhuang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Sheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Zheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">H</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">E</forename><surname>Gonzalez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Stoica</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles</title>
				<meeting>the ACM SIGOPS 29th Symposium on Operating Systems Principles</meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b28">
	<monogr>
		<author>
			<persName><forename type="first">I</forename><surname>Loshchilov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Hutter</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1711.05101</idno>
		<title level="m">Decoupled weight decay regularization</title>
				<imprint>
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b29">
	<monogr>
		<author>
			<persName><forename type="first">L</forename><surname>Moroni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Conia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Martelli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Navigli</surname></persName>
		</author>
		<title level="m">ITA-Bench: Towards a more comprehensive evaluation for Italian LLMs</title>
				<imprint>
			<publisher>CLiC-it</publisher>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b30">
	<monogr>
		<author>
			<persName><forename type="first">P</forename><surname>Clark</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Cowhey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Etzioni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Khot</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Sabharwal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Schoenick</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Tafjord</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1803.05457</idno>
		<title level="m">Think you have solved question answering? try arc, the ai2 reasoning challenge</title>
				<imprint>
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b31">
	<monogr>
		<author>
			<persName><forename type="first">C</forename><surname>Clark</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M.-W</forename><surname>Chang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Kwiatkowski</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Collins</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Toutanova</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1905.10044</idno>
		<title level="m">Boolq: Exploring the surprising difficulty of natural yes/no questions</title>
				<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b32">
	<monogr>
		<author>
			<persName><forename type="first">K</forename><surname>Cobbe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kosaraju</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Bavarian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Jun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Kaiser</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Plappert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Tworek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Hilton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Nakano</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2110.14168</idno>
		<title level="m">Training verifiers to solve math word problems</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b33">
	<monogr>
		<title level="m" type="main">Hellaswag: Can a machine really finish your sentence?</title>
		<author>
			<persName><forename type="first">R</forename><surname>Zellers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Holtzman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Bisk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Farhadi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Choi</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1905.07830</idno>
		<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b34">
	<analytic>
		<title level="a" type="main">Measuring massive multitask language understanding</title>
		<author>
			<persName><forename type="first">D</forename><surname>Hendrycks</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Burns</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Basart</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Zou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Mazeika</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Song</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Steinhardt</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the International Conference on Learning Representations</title>
				<meeting>the International Conference on Learning Representations</meeting>
		<imprint>
			<publisher>ICLR</publisher>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b35">
	<analytic>
		<title level="a" type="main">Piqa: Reasoning about physical commonsense in natural language</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Bisk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Zellers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Gao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Choi</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the AAAI conference on artificial intelligence</title>
				<meeting>the AAAI conference on artificial intelligence</meeting>
		<imprint>
			<date type="published" when="2020">2020</date>
			<biblScope unit="volume">34</biblScope>
			<biblScope unit="page" from="7432" to="7439" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b36">
	<monogr>
		<author>
			<persName><forename type="first">J</forename><surname>Welbl</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">F</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Gardner</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1707.06209</idno>
		<title level="m">Crowdsourcing multiple choice science questions</title>
				<imprint>
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b37">
	<monogr>
		<author>
			<persName><forename type="first">S</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Hilton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Evans</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2109.07958</idno>
		<title level="m">Truthfulqa: Measuring how models mimic human falsehoods</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b38">
	<analytic>
		<title level="a" type="main">Winogrande: An adversarial winograd schema challenge at scale</title>
		<author>
			<persName><forename type="first">K</forename><surname>Sakaguchi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">L</forename><surname>Bras</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Bhagavatula</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Choi</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Communications of the ACM</title>
		<imprint>
			<biblScope unit="volume">64</biblScope>
			<biblScope unit="page" from="99" to="106" />
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b39">
	<monogr>
		<title level="m" type="main">Disce aut deficere: Evaluating llms proficiency on the INVALSI Italian benchmark</title>
		<author>
			<persName><forename type="first">F</forename><surname>Mercorio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Mezzanzanica</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Potertì</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Serino</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Seveso</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2406.17535" />
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b40">
	<analytic>
		<title level="a" type="main">Fewshot learning with multilingual generative language models</title>
		<author>
			<persName><forename type="first">X</forename><forename type="middle">V</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Mihaylov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Artetxe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Simig</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Ott</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Goyal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Bhosale</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Du</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Pasunuru</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Shleifer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">S</forename><surname>Koura</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Chaudhary</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>O'horo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Zettlemoyer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Kozareva</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Diab</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Stoyanov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Li</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2022.emnlp-main.616</idno>
		<ptr target="https://aclanthology.org/2022.emnlp-main.616" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">Y</forename><surname>Goldberg</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">Z</forename><surname>Kozareva</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</editor>
		<meeting>the 2022 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics<address><addrLine>Abu Dhabi, United Arab Emirates</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2022">2022</date>
			<biblScope unit="page" from="9019" to="9052" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b41">
	<monogr>
		<author>
			<persName><forename type="first">S</forename><surname>Mehta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">H</forename><surname>Sekhavat</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Cao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Horton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Jin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Sun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Mirzadeh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Najibi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Belenko</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Zatloukal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Rastegari</surname></persName>
		</author>
		<idno>arXiv.org</idno>
		<ptr target="https://arxiv.org/abs/2404.14619v1" />
		<title level="m">OpenELM: An Efficient Language Model Family with Open Training and Inference Framework</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b42">
	<monogr>
		<author>
			<persName><forename type="first">G</forename><surname>Sarti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Nissim</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2203.03759</idno>
		<title level="m">It5: Large-scale text-to-text pretraining for italian language understanding and generation</title>
				<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b43">
	<analytic>
		<title level="a" type="main">Two new datasets for italian-language abstractive text summarization</title>
		<author>
			<persName><forename type="first">N</forename><surname>Landro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Gallo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>La Grassa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Federici</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Information</title>
		<imprint>
			<biblScope unit="volume">13</biblScope>
			<biblScope unit="page">228</biblScope>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b44">
	<monogr>
		<author>
			<persName><forename type="first">Z</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Lam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">M.-C</forename><surname>So</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Collier</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2304.04052</idno>
		<title level="m">Decoder-only or encoder-decoder? interpreting language model as a regularized encoderdecoder</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b45">
	<analytic>
		<title level="a" type="main">The unreasonable effectiveness of few-shot learning for machine translation</title>
		<author>
			<persName><forename type="first">X</forename><surname>Garcia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Bansal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Cherry</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Foster</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Krikun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Johnson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Firat</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">International Conference on Machine Learning</title>
				<meeting><address><addrLine>PMLR</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="10867" to="10878" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b46">
	<analytic>
		<title level="a" type="main">The flores-101 evaluation benchmark for low-resource and multilingual machine translation</title>
		<author>
			<persName><forename type="first">N</forename><surname>Goyal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Gao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Chaudhary</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P.-J</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Wenzek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Ju</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Krishnan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Ranzato</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Guzmán</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Fan</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Transactions of the Association for Computational Linguistics</title>
		<imprint>
			<biblScope unit="volume">10</biblScope>
			<biblScope unit="page" from="522" to="538" />
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b47">
	<monogr>
		<author>
			<persName><forename type="first">B</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Williams</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Titov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Sennrich</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2004.11867</idno>
		<title level="m">Improving massively multilingual neural machine translation and zero-shot translation</title>
				<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b48">
	<analytic>
		<title level="a" type="main">Biases in large language models: Origins, inventory, and discussion</title>
		<author>
			<persName><forename type="first">R</forename><surname>Navigli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Conia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Ross</surname></persName>
		</author>
		<idno type="DOI">10.1145/3597307</idno>
		<ptr target="https://doi.org/10.1145/3597307" />
	</analytic>
	<monogr>
		<title level="j">J. Data and Information Quality</title>
		<imprint>
			<biblScope unit="volume">15</biblScope>
			<biblScope unit="page" from="1" to="21" />
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b49">
	<analytic>
		<title level="a" type="main">Increasing coverage and precision of textual information in multilingual knowledge graphs</title>
		<author>
			<persName><forename type="first">S</forename><surname>Conia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">U</forename><surname>Minhas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Ilyas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Li</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2023.emnlp-main.100</idno>
		<ptr target="https://aclanthology.org/2023.emnlp-main.100" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
		<title level="s">Association for Computational Linguistics</title>
		<editor>
			<persName><forename type="first">H</forename><surname>Bouamor</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><surname>Pino</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">K</forename><surname>Bali</surname></persName>
		</editor>
		<meeting>the 2023 Conference on Empirical Methods in Natural Language Processing<address><addrLine>Singapore</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="1612" to="1634" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b50">
	<analytic>
		<title level="a" type="main">Towards cross-cultural machine translation with retrieval-augmented generation from multilingual knowledge graphs</title>
		<author>
			<persName><forename type="first">S</forename><surname>Conia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">U</forename><forename type="middle">F</forename><surname>Minhas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Potdar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Li</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2410.14057" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
				<meeting>the 2024 Conference on Empirical Methods in Natural Language Processing<address><addrLine>Miami, Florida, USA</addrLine></address></meeting>
		<imprint>
			<publisher>Association for Computational Linguistics</publisher>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
