<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">CALAMITA: Challenge the Abilities of LAnguage Models in ITAlian</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Giuseppe</forename><surname>Attanasio</surname></persName>
							<email>giuseppe.attanasio@lx.it.pt</email>
							<affiliation key="aff0">
								<orgName type="institution">Instituto de Telecomunicações</orgName>
								<address>
									<settlement>Lisbon</settlement>
									<country key="PT">Portugal</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Pierpaolo</forename><surname>Basile</surname></persName>
							<email>pierpaolo.basile@uniba.it</email>
							<affiliation key="aff1">
								<orgName type="institution">University of Bari &quot;Aldo Moro&quot;</orgName>
								<address>
									<settlement>Bari</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Federico</forename><surname>Borazio</surname></persName>
							<email>borazio@ing.uniroma2.it</email>
							<affiliation key="aff2">
								<orgName type="institution">University of Rome &quot;Tor Vergata&quot;</orgName>
								<address>
									<settlement>Rome</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Danilo</forename><surname>Croce</surname></persName>
							<affiliation key="aff2">
								<orgName type="institution">University of Rome &quot;Tor Vergata&quot;</orgName>
								<address>
									<settlement>Rome</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Maria</forename><surname>Francis</surname></persName>
							<email>maria.francis287@gmail.com</email>
							<affiliation key="aff3">
								<orgName type="laboratory">CLCG</orgName>
								<orgName type="institution">University of Groningen</orgName>
								<address>
									<settlement>Groningen</settlement>
									<country key="NL">The Netherlands</country>
								</address>
							</affiliation>
							<affiliation key="aff4">
								<orgName type="institution">University of Trento</orgName>
								<address>
									<settlement>Trento</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Jacopo</forename><surname>Gili</surname></persName>
							<email>jacopo.gili584@edu.unito.it</email>
							<affiliation key="aff5">
								<orgName type="department">Computer Science Department</orgName>
								<orgName type="institution">University of Turin</orgName>
								<address>
									<settlement>Turin</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Elio</forename><surname>Musacchio</surname></persName>
							<email>elio.musacchio@phd.unipi.it</email>
							<affiliation key="aff1">
								<orgName type="institution">University of Bari &quot;Aldo Moro&quot;</orgName>
								<address>
									<settlement>Bari</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Malvina</forename><surname>Nissim</surname></persName>
							<email>m.nissim@rug.nl</email>
							<affiliation key="aff3">
								<orgName type="laboratory">CLCG</orgName>
								<orgName type="institution">University of Groningen</orgName>
								<address>
									<settlement>Groningen</settlement>
									<country key="NL">The Netherlands</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Viviana</forename><surname>Patti</surname></persName>
							<email>viviana.patti@unito.it</email>
							<affiliation key="aff5">
								<orgName type="department">Computer Science Department</orgName>
								<orgName type="institution">University of Turin</orgName>
								<address>
									<settlement>Turin</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Matteo</forename><surname>Rinaldi</surname></persName>
							<email>matteo.rinaldi@unito.it</email>
							<affiliation key="aff5">
								<orgName type="department">Computer Science Department</orgName>
								<orgName type="institution">University of Turin</orgName>
								<address>
									<settlement>Turin</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Daniel</forename><surname>Scalena</surname></persName>
							<email>d.scalena@campus.unimib.it</email>
							<affiliation key="aff3">
								<orgName type="laboratory">CLCG</orgName>
								<orgName type="institution">University of Groningen</orgName>
								<address>
									<settlement>Groningen</settlement>
									<country key="NL">The Netherlands</country>
								</address>
							</affiliation>
							<affiliation key="aff6">
								<orgName type="institution">University of Milan Bicocca</orgName>
								<address>
									<settlement>Milan</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<affiliation key="aff7">
								<orgName type="department">Tenth Italian Conference on Computational Linguistics</orgName>
								<address>
									<addrLine>Dec 04-06</addrLine>
									<postCode>2024</postCode>
									<settlement>Pisa</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">CALAMITA: Challenge the Abilities of LAnguage Models in ITAlian</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">0FE3C9D9DDC5714911D3E41ABC2C15DF</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T17:35+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Italian Benchmark, Shared Task, Language Models</term>
					<term>Orcid 0000-0001-6945-3698 (G. Attanasio)</term>
					<term>0000-0002-0545-1105 (P. Basile)</term>
					<term>0009-0000-0193-2131 (F. Borazio)</term>
					<term>0000-0001-9111-1950 (D. Croce)</term>
					<term>0009-0007-7638-9963 (M. Francis)</term>
					<term>0009-0007-1343-3760 (J. Gili)</term>
					<term>0009-0006-9670-9998 (E. Musacchio)</term>
					<term>0000-0001-5289-0971 (M. Nissim)</term>
					<term>0000-0001-5991-370X (V. Patti)</term>
					<term>0009-0004-7488-8855 (M. Rinaldi)</term>
					<term>0009-0006-0518-6504 (D. Scalena)</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>The rapid development of Large Language Models (LLMs) has called for robust benchmarks to assess their abilities, track progress, and compare iterations. While existing benchmarks provide extensive evaluations across diverse tasks, they predominantly focus on English, leaving other languages underserved. For Italian, the EVALITA campaigns have provided a long-standing tradition of classification-focused shared tasks. However, their scope does not fully align with the nuanced evaluation required for modern LLMs. To address this gap, we introduce "Challenge the Abilities of LAnguage Models in ITAlian" (CALAMITA), a collaborative effort to create a dynamic and growing benchmark tailored to Italian. CALAMITA emphasizes diversity in task design to test a wide range of LLM capabilities through resources natively developed in Italian by the community. This initiative includes a shared platform, live leaderboard, and centralized evaluation framework. This paper outlines the collaborative process, initial challenges, and evaluation framework of CALAMITA.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>In parallel with the ongoing and constant development of new Large Language Models (LLMs), it has increased the need for understanding their abilities, how they differ from one another, and how they improve compared to previous iterations. To meet this need, the last couple of years have witnessed multiple efforts to put together new-or revisiting existing-benchmarks against which the performance and progress of LLMs can be monitored. These benchmarks include different tasks to test a variety of characteristics and abilities that are assumed to be associated with LLMs at different degrees. To mention a few, these span from multiple-choice questions of various sorts, commonsense and mathematical reasoning, and a variety of linguistic phenomena. BIGbench <ref type="bibr" target="#b0">[1]</ref> is currently the largest and most comprehensive benchmark, including over 200 tasks, almost all in English, which have been collaboratively contributed by researchers across the globe.</p><p>However, benchmarking progress for languages other than English has not improved with comparable quality. In many cases, evaluation datasets are automatic translations of their English counterparts, yielding not only a less native and possibly ungrammatical language but also a cultural picture that is distant from the target language.</p><p>In the Italian NLP landscape, there is a long tradition of evaluation through the contribution of shared tasks. These benchmarks have been collected and run for almost 20 years in the context of the EVALITA campaigns (https://www.evalita.it/). The campaigns have fostered the creation of training and evaluation resources and models natively developed for Italian. 
Based on such resources, UINAUIL (Unified Interactive Natural Understanding of the Italian Language) <ref type="bibr" target="#b1">[2]</ref>, an integrated benchmark for Italian NLU including six tasks has been recently proposed, and tested with available Italian and multilingual language models. Except for CHANGE-IT <ref type="bibr" target="#b2">[3]</ref>, a generation task focused on headline transformation and organized within the EVALITA 2020 edition, all EVALITA tasks have focused on classification problems (some have been recast as generation problems as part of a resource release within the "Risorse per la Lingua Italiana" (RiTA) community <ref type="bibr" target="#b3">[4]</ref>). However, to improve upon existing benchmarks, we wanted the core of a dynamic reference benchmark for Italian to include new tasks specifically focused on testing LLMs' abilities.</p><p>Therefore, in the steps of this solid Italian benchmarking tradition, and in line with the most recent developments regarding the evaluation of LLMs, AILC-the Italian Association for Computational Linguistics-has launched "Challenge the Abilities of LAnguage Models in ITAlian" (CALAMITA), a large-scale collaborative initiative across the whole Italian NLP community to develop a dynamic and growing benchmark for evaluating LLMs' capabilities in Italian. This strategy would ensure a high diversity of tasks and, thus, of tested capabilities. It would distribute the effort of creative resources natively in Italian across many researchers and practitioners.</p><p>In the long term, we aim to establish a continuously growing suite of tasks that can be accessed through a shared platform and a live leaderboard so that any newly developed LLM, either multilingual or Italian monolingual, can be readily assessed. In the short term, we have started to build the CALAMITA benchmark through a series of challenges collaboratively contributed by the research community (Section 2). 
Also, we have established an evaluation framework that enables running the current and possibly future challenges in a centralized and coherent manner. This short paper summarises the collaborative procedure, the challenges currently included in CALAMITA 1 , and the evaluation procedure.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Collaborative Methodology</head><p>The CALAMITA approach is inspired by standard Natural Language Processing shared tasks, giving the benchmark 1 The CALAMITA website: https://clic2024.ilc.cnr.it/calamita/.</p><p>a strong collaborative nature. The Italian Association for Computational Linguistics (AILC, https://www.ai-lc.it) launched a public call, mainly aimed at the Italian NLP community but spread across the standard international communication channels, asking for challenges and corresponding datasets that LLMs could be tested on.</p><p>Participants contributing to a challenge were expected to provide an explanation and motivation for a given challenge, as well as a dataset that reflects that challenge. Participants were also asked to provide any information relevant to the dataset (provenance, annotation, distribution of labels or phenomena, etc.). Evaluation metrics and examples were also expected, along with the task and dataset submission. Existing relevant datasets could also be submitted as long as they made an interesting contribution to the benchmark and were natively created in Italian. To standardize the contribution to the CALAMITA benchmark, all proposed tasks with existing or new datasets had to follow a predefined template created and distributed by the CALAMITA organizers.</p><p>Creating the CALAMITA benchmark and the first round of LLM evaluation required several steps. In the first phase, all prospective participants submitted a pre-proposal. In case of a positive evaluation -based on compliance with the requirements and balance across submissions -participants were then asked to submit the final and complete challenge, following the provided CALAMITA template, in phase two. 
A final report was also requested for each accepted task, providing information on implementing the code for the evaluation.</p><p>The data and evaluation team set up the final CALAMITA benchmark by compiling the data and code of all the proposed tasks. We forked the Language Model Evaluation Harness tool<ref type="foot" target="#foot_0">2</ref> to create a custom CALAMITA version by including all the accepted tasks. Once the benchmark was assembled, the CALAMITA organizers ran zero-or few-shot experiments with a selection of LLMs. No tuning materials or experiments are expected at this project stage. Also, while we expect that CALAMITA, in the longer run, will be further populated by additional tasks and will have its own publicly accessible leaderboard, allowing for model testing, in this first stage, the choice of LLMs to be evaluated and the evaluation procedure is centralized.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Challenges</head><p>The preliminary call for tasks yielded the submission of over 20 proposals. Almost all of them were retained and are part of the present CALAMITA challenge, apart from the proposals that aimed at testing abilities that LLMs should not be expected to have, such as abilities typical of information retrieval engines and the proposals that</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Ability tested</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Description Count</head><p>Commonsense knowledge General knowledge about the world that is typically taken for granted in everyday life, e.g., everyday cause-and-effect relationships, situational judgments, physical properties, and basic social interactions.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>19</head><p>Factual knowledge Knowledge of concrete, verifiable facts about the world, e.g., definitions, historical events, or scientific concepts.</p><p>12 Linguistic knowledge Linguistically motivated tasks that test specific language skills, e.g., word sense disambiguation, coreference resolution, or acceptability judgment.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>22</head><p>Formal reasoning Ability to understand and use formally logical principles to solve problems, e.g., mathematical problems. 9</p><p>Fairness and bias Evaluates a model's capacity to handle sensitive tasks, including exclusive and stereotyped language understanding and detecting offensive or biased language towards social groups.</p><p>6</p><p>Code generation Ability to generate fully functioning code for a specific programming language. 1 Machine translation Ability to translate a sentence from a source language into another language, with one of the two being Italian.</p><p>2 Summarization Ability to create relevant summaries of a given excerpt, e.g., news headline generation or news reduction.</p><p>2</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 1</head><p>Categories of abilities tested by CALAMITA tasks. Tasks test general abilities such as knowledge about true facts, commonsense, and logical reasoning (top) or specific NLP-oriented abilities such as code generation or machine translation (bottom). Each task may require models to exhibit more than one ability.</p><p>required manual evaluation. In what follows, we briefly describe each task included in CALAMITA and refer the reader to each of the challenges' reports for further details. In Table <ref type="table">1</ref>, we describe the macro categories under which the CALAMITA tasks can be grouped, where categories are broad classes of tested abilities. Table <ref type="table">2</ref> shows which abilities apply to each challenge.</p><p>ABRICOT (ABstRactness and Inclusiveness in COn-texT) <ref type="bibr" target="#b4">[5]</ref> is a task designed to evaluate Italian language models on their ability to understand and assess the abstractness and inclusiveness of language, two nuanced features that humans naturally convey in everyday communication. Unlike binary categorizations such as abstract/concrete or inclusive/exclusive, these features exist on a continuous spectrum with varying degrees of intensity. The task is based on a manual collection of sentences that present the same noun phrase (NP) in different contexts, allowing its interpretation to vary between the extremes of abstractness and inclusiveness. This challenge aims to verify how LLMs perceive subtle linguistic variations and their implications in natural language. <ref type="bibr" target="#b5">[6]</ref> is a challenge consisting of three classification tasks in the context of argument mining in the legal domain. The tasks are based on a dataset of 225 Italian decisions on Value Added Tax, annotated to identify and categorize argumentative text. 
The objective of the first task is to classify each argumen-tative component as a premise or conclusion. In contrast, the second and third tasks aim at classifying the type of premise: legal vs factual, and its corresponding argumentation scheme. The classes are highly unbalanced, hence evaluation is based on the macro F1 score.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>AMELIA (Argument Mining Evaluation on Legal documents in ItAlian)</head><p>BEEP (BEst DrivEr's License Performer) <ref type="bibr" target="#b6">[7]</ref> is a benchmark to evaluate large language models in the context of a simulated Italian driver's license exam. This challenge tests the models' ability to understand and apply traffic laws, road safety regulations, and vehicle-related knowledge through a series of true/false questions. The dataset is derived from official ministerial materials used in the Italian licensing process, explicitly targeting Category B licenses. <ref type="bibr" target="#b7">[8]</ref> is a task made of linguistic puzzles (matrices) around language-related problems, focusing on formal and semantic properties of language. A BLM matrix consists of a context set and an answer set. The context is a sequence of sentences that encodes implicitly an underlying generative linguistic rule. The contrastive multiple-choice answer set includes negative examples following corrupted generating rules. The models are prompted in a few-shot setting. The datasets comprise a few prompts for a few-shot setting. <ref type="bibr" target="#b8">[9]</ref> is a task aimed at evaluating the proficiency of Large Language Models in extracting drug-specific information from Patient Information Leaflets. The challenge evaluates the effectiveness of processing complex medical information in Italian and is approached as an information extraction task in a zero-shot setting, based on the model's pre-existing knowledge or through in-context learning. Evaluation is performed against a manually created gold standard.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>BLM-It (Blackbird Language Matrices)</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>DIMMI (Drug InforMation Mining in Italian)</head><p>ECWCA (Educational CrossWord Clues Answering) <ref type="bibr" target="#b9">[10]</ref> is designed to evaluate the knowledge and reasoning capabilities of LLMs through crossword clue-answering. The challenge consists of two tasks: a standard questionanswering format where the LLM is asked to solve crossword clues and a variation where the model is given hints about the word lengths of the answers, which is expected to help models with reasoning abilities.</p><p>EurekaRebus <ref type="bibr" target="#b10">[11]</ref> is a task that tests the ability of LLMs to conduct multi-step, knowledge-intensive inferences while respecting predefined constraints. LLMs are prompted to reason step-by-step to solve verbalized variants of rebus games. Verbalized rebuses replace visual cues with crossword definitions to create an encrypted first pass, making the problem entirely text-based. Multiple metrics are used to grasp the models' performance in knowledge recall, constraints adherence, and re-segmentation abilities across reasoning steps.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>GATTINA (GenerAtion of TiTles for Italian News</head><p>Articles) <ref type="bibr" target="#b11">[12]</ref> is a task that aims to assess the ability of LLMs to generate headlines for science news articles. Aspects such as the appropriateness of the summary, creativity, and attractiveness are evaluated through a battery of metrics. The benchmark consists of a large dataset of science news articles and their corresponding published headlines from ANSA Scienza and Galileo, two prominent Italian media outlets.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>GEESE (Generating and Evaluating Explanations</head><p>for Semantic Entailment) <ref type="bibr" target="#b12">[13]</ref> is focused on evaluating the impact of generated explanations on the predictive performance of language models for the task of Recognizing Textual Entailment in Italian. Using a dataset enriched with human-written explanations, two large language models are employed to generate and utilize explanations for semantic relationships between sentence pairs. GEESE assesses the quality of generated explanations by measuring changes in prediction accuracy when explanations are provided. <ref type="bibr" target="#b13">[14]</ref> is a task designed to assess and monitor the recognition and generation of gender-fair language in both mono-and cross-lingual scenarios. It includes three tasks: (1) the detection of gender-marked expressions in Italian sentences, (2) the rewriting of gendered expressions into gender-fair alternatives, and (3) the generation of gender-fair language in automatic translation from English to Italian. The challenge relies on three different annotated datasets: the GFL-it corpus, which contains Italian texts extracted from administrative documents provided by the University of Brescia; GeNTE, a bilingual test set for genderneutral rewriting and translation built upon a subset of the Europarl dataset; Neo-GATE, a bilingual test set designed to assess the use of non-binary neomorphemes in Italian for both fair formulation and translation tasks.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>GFG (Gender-Fair Generation)</head><p>GITA (Graded Italian Annotated Dataset) <ref type="bibr" target="#b14">[15]</ref> investigates the physical commonsense reasoning capabilities of large language models, assessing their low-level understanding of the physical world using a test set in the Italian language. Three specific tasks are evaluated: identifying plausible and implausible stories within our dataset, identifying the conflict that generates an implausible story, and identifying the physical states that make a story implausible. It is written and annotated by a professional linguist.</p><p>INVALSI <ref type="bibr" target="#b15">[16]</ref> is a benchmark based on the Invalsi tests administered to students within the Italian school system. Expert pedagogists prepare these tests with the explicit goal of testing average students' performance over time across Italy. There are two benchmarks: Invalsi MATE (420 questions), which targets the models' performance on mathematical understanding, and Invalsi ITA (1279 questions), which evaluates language understanding in Italian.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>ITA-SENSE (ITAlian word SENSE disambiguation)</head><p>[17] is a task that assesses LLMs' abilities in understanding lexical semantics through Word Sense Disambiguation. The classical Word Sense Disambiguation task is cast as a generative problem formalized as two tasks:</p><p>[T1] Given a target word and a sentence in which the word occurs, generate the correct meaning definition;</p><p>[T2] Given a target word and a sentence in which the word occurs, choose the correct meaning definition from a predefined set. For CALAMITA, LLMs are tested in a zero-shot setting. <ref type="bibr" target="#b17">[18]</ref> is a task aimed at evaluating LLMs to differentiate between closely related action concepts based on textual descriptions alone. The challenge is inspired by the "find the intruder" task, where models must identify an outlier among a set of 4 sentences that describe similar yet distinct actions. The dataset highlights action-predicate mismatches, where the same verb may describe different actions, or different verbs may refer to the same action.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>MACID (Multimodal ACtion IDentification)</head><p>Although mono-modal (text-only), the task is designed for future multimodal integration, linking visual and textual representations to enhance action recognition. <ref type="bibr" target="#b18">[19]</ref> is a task that aims at testing the ability of LLMs in automatic translation, focusing on Italian and English (in both directions). The task proposes a benchmark composed of two datasets covering different domains and with varying distribution policies. Performances are reported in terms of four evaluation metrics, whose scores allow an overall evaluation of the quality of the automatically generated translations. <ref type="bibr" target="#b19">[20]</ref> is a large-scale Multi-Choice Question Answering (MCQA) dataset for evaluating the factual knowledge and reasoning abilities of LLMs in Italian. This contribution aims to counteract the disadvantages of using MCQA benchmarks that are automatically translated from English and may sound unnatural, contain errors, or use linguistic constructions that do not align with the target language. In addition, they may introduce topical and ideological biases reflecting Anglo-centric perspectives. Mult-IT comprises over 110,000 manually written questions sourced directly from preparation quizzes for Italian university entrance exams or for exams for public sector employment in Italy.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>MT (Machine Translation)</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Mult-IT</head><p>PejorativITy <ref type="bibr" target="#b20">[21]</ref> is a task to investigate misogyny expressed through neutral words that can assume a negative connotation when functioning as pejorative epithets. This challenge addresses a) the disambiguation of such ambiguous words in a given context; b) the detection of misogyny in instances that contain such polysemic words. The task is divided into two parts, both framed as a binary classification. In Task A, the model is asked to define if, given a tweet, the target word is used in a pejorative or non-pejorative way. In Task B, the model is asked whether the whole sentence is misogynous.</p><p>PERSEID (PERSpEctivist Irony Detection) <ref type="bibr" target="#b21">[22]</ref> considers the task of irony detection from short social media conversations collected from Twitter (X) and Reddit. Data is leveraged from MultiPICO, a recent multilingual dataset with disaggregated annotations and annotators' metadata. The dataset evaluates whether prompting LLMs with additional annotators' demographic information (gender only, age only, and the combination of the two) improves performance compared to a baseline in which only the input text is provided. <ref type="bibr" target="#b22">[23]</ref> is a benchmark designed to evaluate the ability of LLMs to comprehend a specific type of complex syntactic construction in Italian: object relative clauses. The challenge is framed as a binary entailment task where, given a complex sentence, the model is tasked with determining whether it logically entails a simpler yes/no implication.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>TRACE-it (Testing Relative clAuses Comprehension through Entailment in ITalian)</head><p>Termite <ref type="bibr" target="#b23">[24]</ref> focuses on the Text-to-SQL task in Italian. Natural language queries are written natively in Italian, and the models are expected to turn them into SQL queries. The dataset is built to be invisible to search engines since it is locked under an encryption key delivered along the resource to reduce accidental inclusion in upcoming training sets. It contains hand-crafted databases in different domains, each with a balanced set of NL-SQL query pairs. The NL questions are built in such a way that they can be solved by a model relying only on its linguistic proficiency and an analysis of the schema, with no external knowledge needed.</p><p>VeryfIT <ref type="bibr" target="#b24">[25]</ref> is designed to evaluate the in-memory factual knowledge of language models on data written by professional fact-checkers, posing it as a true or false question. Topics of the statements vary, but most are in specific domains related to the Italian government, policies, and social issues. The task presents several challenges: extracting statements from segments of speeches, determining appropriate contextual relevance both temporally and factually, and verifying the statements' accuracy.</p><p>ItaEval <ref type="bibr" target="#b25">[26]</ref> is a multifaceted evaluation suite comprising three overarching task categories: (i) natural language understanding, (ii) commonsense and factual knowledge, and (iii) bias, fairness, and safety <ref type="bibr" target="#b3">[4]</ref>. ItaEval is a collection of 18 tasks encompassing existing and new datasets. The so-compiled ItaEval suite provides a standardized, multifaceted framework for evaluating Italian language models, facilitating more rigorous and comparative assessments of model performance.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Evaluation Strategy</head><p>Rooted in its very nature, CALAMITA's biggest challenge is standardizing evaluation across many tasks and scenarios. To account for such high variability, we settled on a few fundamental choices that shape CALAMITA's core principles (Design choices) and left broad freedom to challenge participants to specify fine-grained aspects of their tasks (Participant choices). Base design choices shared across all tasks and high task-specific customization balance standardization and versatility. Abilities tested by each task in CALAMITA. * : tasks that require contextualized factual knowledge, e.g., reading comprehension tasks. * * : tasks that require stereotypical commonsense knowledge, e.g., understanding the concept of misogyny.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Task</head><p>Design choices. Following recent practices for language model evaluation [e.g., 27, 28], we consider every received task as a downstream task to be solved via standard prompting. We support two types of tasks: Multiple-Choice (MC) and Open-Ended (OE) generation. MC tasks require a model to pick one or more correct answers from a finite set. OE tasks require models to generate output tokens until a stopping criterion is met. For evaluating multiple-choice tasks, we rank all candidates by their likelihood conditioned on the prompt and pick the highest <ref type="bibr" target="#b28">[29]</ref>. We normalize each option probability by the number of tokens. Closed-question question-answering is an example of an MC task. We do not adopt a single strategy for OE tasks, as evaluation depends on the semantics of the output. Machine translation and summarization are examples of OE tasks. Moreover, we standardize the decoding strategy across OE tasks. We use beam search (𝑛 = 5) for machine translation and greedy decoding for all other tasks. See Appendix A for the complete details.</p><p>To foster reproducibility, we base CALAMITA's codebase on open-source tools. We forked and built our evaluation code upon lm-eval <ref type="bibr" target="#b29">[30]</ref>. When possible, we recommended public and accessible data release to the participants through the HuggingFace Hub. <ref type="foot" target="#foot_1">3</ref> We release our evaluation code at https://github.com/CALAMITA-AILC/lm-evaluation-harness.</p><p>Participant choices. In addition to the data associated with the task and the type (MC or OE), we request that each participating team provides specifics regarding compiling an arbitrary prompt and evaluating an arbitrary model generation. Among prompting details, task proposers specified a prompt template and the number of task demonstrations (0 for zero-shot, N for N-shot prompting). 
In few-shot cases, we requested where to sample the demonstrations and the sampling strategy (static, dynamic-random, or dynamic-sequential). Among the evaluation details, we requested that participants specify any post-processing function for model raw outputs, one or more evaluation metrics, and relative information. For reporting purposes, we collected a single evaluation score (the first metric listed by proposers).</p><p>Crucially, we relied upon meta-description and code to streamline the communication between the task proposers and the challenge organizers. Participants were tasked to provide such information through a single file following a set of guidelines. <ref type="foot" target="#foot_2">4</ref></p><p>Model Selection. We tested Llama 3.1 8B Instruct <ref type="bibr" target="#b30">[31]</ref> and ANITA <ref type="bibr" target="#b31">[32]</ref>, two state-of-the-art decoder-only language models. Llama's 3.1 variant introduces multilingual support to the family's previous iteration. ANITA is a fine-tuned version of Llama 3 specializing in English and Italian tasks.</p><p>Our choice was driven by three primary reasons. First, both models are open-weight, well-known within the Italian NLP community, and explicitly support the Italian language. Second, they have been instruction fine-tuned, a training step that facilitates addressing tasks in zero-shot. Third, they are within the 8 billion parameter range, which allows for fast iteration and good performance.</p><p>Results. At the time of writing, some of the results are still being collected. To provide a comprehensive and dynamic overview, we refer the reader to the external page where they get regularly updated: https://calamita-ailc.github.io/calamita2024/.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Limitations</head><p>CALAMITA is not intended to be an exhaustive benchmark for testing the abilities of Italian LLMs, especially at this first release. Considering the strong collaborative nature of this benchmark, coherence across tasks might not be optimal, in spite of the efforts put in by the organisers to harmonise all datasets and the evaluation procedure. Although we have paid attention to this issue, we cannot be absolutely certain that none of the datasets, in one form or another, have already ended up in some training set.</p></div>			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="2" xml:id="foot_0">https://github.com/EleutherAI/lm-evaluation-harness</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="3" xml:id="foot_1">Resulting from the effort for CALAMITA, 35 new datasets have been released with a permissive license.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="4" xml:id="foot_2">See the guidelines at https://github.com/CALAMITA-AILC/ calamita2024 and the information file at https://gist.github.com/ g8a9/f5e82d38ce12831323b20dc79b0452c9</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="5" xml:id="foot_3">https://www.hpc.cineca.it/systems/hardware/leonardo/</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="6" xml:id="foot_4">https://calamita-ailc.github.io/calamita2024/</note>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>The ItaEval tasks submitted to CALAMITA are the result of a joint effort of members of the "Risorse per la Lingua Italiana" community (rita-nlp.org): we thank every member who dedicated their time to the project. For providing the computational resources we thank CINECA (ISCRA grant: HP10C3RW9F; ISCRA C grant: CALAMITA -HP10CKZDYT), the Center for Information Technology of the University of Groningen for their support and for providing access to the Hábrók high performance computing cluster and University of Turin for providing access to the HPC4AI cluster <ref type="bibr" target="#b32">[33]</ref>. Malvina Nissim's work is also part of the "Humane AI" theme of the Dutch Sectorplan for the Humanities. The work of Viviana Patti is partially supported by "HARMONIA" project -M4-C2, I1.3 Partenariati Estesi -Cascade Call -FAIR -CUP C63C22000770006 -PE PE0000013 under the NextGenerationEU programme. The work by Giuseppe Attanasio was supported by the Portuguese Recovery and Resilience Plan through project C645008882-00000055 (Center for Responsible AI) and by Fundação para a Ciência e Tecnologia through contract UIDB/50008/2020. The work by Pierpaolo Basile and Elio Musacchio was supported by the PNRR project FAIR -Future AI Research (PE00000013), Spoke 6 -Symbiotic AI (CUP H97G22000210007) under the NRRP MUR program funded by the NextGenerationEU. The work of Matteo Rinaldi and Jacopo Gili has been partly supported by the Spoke "Future HPC &amp; Big Data" of the ICSC -Centro Nazionale di Ricerca in "High Performance Computing, Big Data and Quantum Computing", funded by European Union -NextGenerationEU.</p></div>
			</div>


			<div type="availability">
<div xmlns="http://www.tei-c.org/ns/1.0"><p>(D. Scalena) GLOBE https://gattanasio.cc/ (G. Attanasio); https://swap.di.uniba.it/members/basile.pierpaolo/ (P. Basile); https://github.com/crux82 (D. Croce); https://github.com/rosakun (M. Francis); https://github.com/Jj-source (J. Gili); https://github.com/m-elio (E. Musacchio); https://malvinanissim.github.io (M. Nissim); https://github.com/vivpatti (V. Patti); https://github.com/mrinaldi97 (M. Rinaldi); https://github.com/DanielSc4 (D. Scalena</p></div>
			</div>

			<div type="annex">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>A. Experimental Details</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>A.1. Technical Details</head><p>We run our experiments on the LEONARDO HPC infrastructure (Booster partition). The booster module partition is based on BullSequana XH2135 supercomputer nodes, each with four NVIDIA Tensor Core GPUs (custom Ampere A100 GPU 64GB HBM2e, NVLink 3.0 (200GB/s)) and a single Intel CPU. 5  We forked the lm-eval-harness official repository at the commit with hash b2bf7bc4a601c643343757c92c1a51eb69caf1d7.</p><p>We report all technical details on our official webpage. 6  </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>A.2. Generation Configuration</head></div>			</div>
			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Beyond the imitation game: Quantifying and extrapolating the capabilities of language models</title>
		<author>
			<persName><forename type="first">A</forename><surname>Srivastava</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Kleyjo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Wu</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Transactions on Machine Learning Research</title>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">UINAUIL: A unified benchmark for Italian natural language understanding</title>
		<author>
			<persName><forename type="first">V</forename><surname>Basile</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Bioglio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Bosca</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Bosco</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Patti</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2023.acl-demo.33</idno>
		<ptr target="https://aclanthology.org/2023.acl-demo.33" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations), Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">D</forename><surname>Bollegala</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">R</forename><surname>Huang</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">A</forename><surname>Ritter</surname></persName>
		</editor>
		<meeting>the 61st Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations), Association for Computational Linguistics<address><addrLine>Toronto, Canada</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="348" to="356" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<monogr>
		<title level="m" type="main">Change-it@ evalita 2020: Change headlines, adapt news, generate, EVALITA Evaluation of NLP and Speech Tools for Italian</title>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">De</forename><surname>Mattei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Cafagna</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Dell'orletta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Nissim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Gatt</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2020-12-17">December 17th. 2020</date>
			<biblScope unit="page">235</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Itaeval and tweetyita: A new extensive benchmark and efficiency-first language model for italian</title>
		<author>
			<persName><forename type="first">G</forename><surname>Attanasio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Delobelle</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>La Quatra</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Santilli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Savoldi</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Tenth Italian Conference on Computational Linguistics</title>
				<meeting><address><addrLine>Location; Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-04">2024/12/04-2024/12/06. 2024</date>
		</imprint>
	</monogr>
	<note>CLiC-it 2024</note>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">ABRICOT -ABstRactness and Inclusiveness in COntexT: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">G</forename><surname>Puccetti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Collacciani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">A</forename><surname>Ravelli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Esuli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Bolognesi</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">AMELIA -Argument Mining Evaluation on Legal documents in ItAlian: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">G</forename><surname>Grundler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Galassi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Santin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Fidelangeli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Galli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Palmieri</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Lagioia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Sartor</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Torroni</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">BEEP -BEst DrivEr&apos;s License Performer: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">F</forename><surname>Mercorio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Potertì</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Serino</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Seveso</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">BLM-It -Blackbird Language Matrices for Italian: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">C</forename><surname>Jiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Samo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Nastase</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Merlo</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">DIMMI -Drug InforMation Mining in Italian: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">R</forename><surname>Manna</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">P</forename><surname>Di Buono</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Giordano</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">ECWCA -Educational CrossWord Clues Answering A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">A</forename><surname>Zugarini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Zeinalipour</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Fusco</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Zanollo</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">EurekaRebus -Verbalized Rebus Solving with LLMs: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">G</forename><surname>Sarti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Caselli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Bisazza</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Nissim</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">GATTINA -Gen-erAtion of TiTles for Italian News Articles: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">M</forename><surname>Francis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Rinaldi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Gili</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>De Cosmo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Iannaccone</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Nissim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Patti</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">GEESE -Generating and Evaluating Explanations for Semantic Entailment: a CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">A</forename><surname>Zaninello</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Magnini</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">GFG -Gender-Fair Generation: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">S</forename><surname>Frenda</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Piergentili</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Savoldi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Madeddu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Rosola</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Casola</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Ferrando</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Patti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Negri</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Bentivogli</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">GITA4CALAMITA -Evaluating the Physical Commonsense Understanding of Italian LLMs in a Multi-layered Approach: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">G</forename><surname>Pensa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Azurmendi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Etxaniz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Altuna</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Gonzalez-Dios</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">INVALSI -Mathematical and Language Understanding in Italian: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">G</forename><surname>Puccetti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Cassese</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Esuli</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">ITA-SENSE -Evaluate LLMs&apos; ability for ITAlian word SENSE disambiguation: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">P</forename><surname>Basile</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Musacchio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Siciliani</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<analytic>
		<title level="a" type="main">MACID -Multimodal ACtion IDentification: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">A</forename><surname>Ravelli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Varvara</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Gregori</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">MAGNET -MAchines GeNErating Translations: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">M</forename><surname>Cettolo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Piergentili</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Papi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Gaido</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Negri</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Bentivogli</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<analytic>
		<title level="a" type="main">Mult-IT Multiple Choice Questions on Multiple Topics in Italian: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">M</forename><surname>Rinaldi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Gili</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Francis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Goffetti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Patti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Nissim</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b20">
	<analytic>
		<title level="a" type="main">PejorativITy -In-Context Pejorative Language Disambiguation: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">A</forename><surname>Muti</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b21">
	<analytic>
		<title level="a" type="main">PERSEID -Perspectivist Irony Detection: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">V</forename><surname>Basile</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Casola</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Frenda</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">M</forename><surname>Lo</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<analytic>
		<title level="a" type="main">TRACE-it: Testing Relative clAuses Comprehension through Entailment in ITalian: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">D</forename><surname>Brunato</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b23">
	<analytic>
		<title level="a" type="main">Termite Italian Text-to-SQL: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">F</forename><surname>Ranaldi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><forename type="middle">S</forename><surname>Ruzzetti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Onorati</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><forename type="middle">M</forename><surname>Zanzotto</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Ranaldi</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b24">
	<analytic>
		<title level="a" type="main">VeryfIT -Benchmark of Fact-Checked Claims for Italian: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">J</forename><surname>Gili</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Patti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Passaro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Caselli</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b25">
	<analytic>
		<title level="a" type="main">ItaEval: A CALAMITA Challenge</title>
		<author>
			<persName><forename type="first">G</forename><surname>Attanasio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>La Quatra</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Santilli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Savoldi</surname></persName>
		</author>
		<ptr target="CEUR-WS.org" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b26">
	<analytic>
		<title level="a" type="main">OpenELM: An efficient language model family with open training and inference framework</title>
		<author>
			<persName><forename type="first">S</forename><surname>Mehta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">H</forename><surname>Sekhavat</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Cao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Horton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Jin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Sun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">I</forename><surname>Mirzadeh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Najibi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Belenko</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Zatloukal</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Workshop on Efficient Systems for Foundation Models II@ ICML2024</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b27">
	<analytic>
		<title level="a" type="main">OLMo: Accelerating the science of language models</title>
		<author>
			<persName><forename type="first">D</forename><surname>Groeneveld</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Beltagy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Walsh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Bhagia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Kinney</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Tafjord</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">H</forename><surname>Jha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Ivison</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Magnusson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Wang</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2024.acl-long.841</idno>
		<ptr target="https://aclanthology.org/2024.acl-long.841" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics</title>
				<meeting>the 62nd Annual Meeting of the Association for Computational Linguistics<address><addrLine>Bangkok, Thailand</addrLine></address></meeting>
		<imprint>
			<publisher>Long Papers</publisher>
			<date type="published" when="2024">2024</date>
			<biblScope unit="volume">1</biblScope>
			<biblScope unit="page" from="15789" to="15809" />
		</imprint>
	</monogr>
	<note>Association for Computational Linguistics</note>
</biblStruct>

<biblStruct xml:id="b28">
	<analytic>
		<title level="a" type="main">Language models are few-shot learners</title>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">B</forename><surname>Brown</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Mann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Ryder</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Subbiah</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kaplan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Dhariwal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Neelakantan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Shyam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Sastry</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Askell</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Agarwal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Herbert-Voss</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Krueger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Henighan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Child</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ramesh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">M</forename><surname>Ziegler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Winter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Hesse</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Sigler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Litwin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Gray</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Chess</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Clark</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Berner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Mccandlish</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Radford</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Sutskever</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Amodei</surname></persName>
		</author>
		<ptr target="https://proceedings.neurips.cc/paper_files/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 34th International Conference on Neural Information Processing Systems, NeurIPS &apos;20</title>
				<meeting>the 34th International Conference on Neural Information Processing Systems, NeurIPS &apos;20<address><addrLine>Red Hook, NY, USA</addrLine></address></meeting>
		<imprint>
			<publisher>Curran Associates Inc</publisher>
			<date type="published" when="2020">2020</date>
			<biblScope unit="page" from="1877" to="1901" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b29">
	<monogr>
		<author>
			<persName><forename type="first">S</forename><surname>Biderman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Schoelkopf</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Sutawika</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Gao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Tow</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Abbasi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">F</forename><surname>Aji</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">S</forename><surname>Ammanamanchi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Black</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Clive</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2405.14782</idno>
		<title level="m">Lessons from the trenches on reproducible evaluation of language models</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b30">
	<monogr>
		<author>
			<persName><forename type="first">A</forename><surname>Dubey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Jauhri</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Pandey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Kadian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Al-Dahle</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Letman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mathur</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Schelten</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Fan</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2407.21783</idno>
		<title level="m">The Llama 3 Herd of Models</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b31">
	<monogr>
		<author>
			<persName><forename type="first">M</forename><surname>Polignano</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Basile</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Semeraro</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2405.07101</idno>
		<title level="m">Advanced natural-based interaction for the italian language: Llamantino-3-anita</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b32">
	<analytic>
		<title level="a" type="main">Hpc4ai, an ai-on-demand federated platform endeavour</title>
		<author>
			<persName><forename type="first">M</forename><surname>Aldinucci</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Rabellino</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Pironti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Spiga</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Viviani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Drocco</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Guerzoni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Boella</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Mellia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Margara</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Drago</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Marturano</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Marchetto</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Piccolo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Bagnasco</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Lusso</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Vallero</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Attardi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Barchiesi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Colla</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Galeazzi</surname></persName>
		</author>
		<idno type="DOI">10.1145/3203217.3205340</idno>
		<ptr target="https://iris.unito.it/retrieve/handle/2318/1765596/689772/2018_hpc4ai_ACM_CF.pdf" />
	</analytic>
	<monogr>
		<title level="m">ACM Computing Frontiers</title>
				<meeting><address><addrLine>Ischia, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
