<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">GITA4CALAMITA - Evaluating the Physical Commonsense Understanding of Italian LLMs in a Multi-layered Approach: A CALAMITA Challenge</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Giulia</forename><surname>Pensa</surname></persName>
							<email>giulia.pensa.tr@gmail.com</email>
							<affiliation key="aff0">
								<orgName type="institution">University of the Basque Country UPV/EHU</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Ekhi</forename><surname>Azurmendi</surname></persName>
							<email>ekhi.azurmendi@ehu.eus</email>
							<affiliation key="aff1">
								<orgName type="department">HiTZ Center - Ixa</orgName>
								<orgName type="institution">University of the Basque Country UPV/EHU</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Julen</forename><surname>Etxaniz</surname></persName>
							<email>julen.etxaniz@ehu.eus</email>
							<affiliation key="aff1">
								<orgName type="department">HiTZ Center - Ixa</orgName>
								<orgName type="institution">University of the Basque Country UPV/EHU</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Begoña</forename><surname>Altuna</surname></persName>
							<email>begona.altuna@ehu.eus</email>
							<affiliation key="aff1">
								<orgName type="department">HiTZ Center - Ixa</orgName>
								<orgName type="institution">University of the Basque Country UPV/EHU</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Itziar</forename><surname>Gonzalez-Dios</surname></persName>
							<email>itziar.gonzalezd@ehu.eus</email>
							<affiliation key="aff1">
								<orgName type="department">HiTZ Center - Ixa</orgName>
								<orgName type="institution">University of the Basque Country UPV/EHU</orgName>
							</affiliation>
						</author>
						<!-- Removed spurious <author> entries aff2-aff10: Italian example-story sentences (figure/table text) misparsed as author affiliations during extraction -->
						<title level="a" type="main">GITA4CALAMITA - Evaluating the Physical Commonsense Understanding of Italian LLMs in a Multi-layered Approach: A CALAMITA Challenge</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">94F39287F948EBAF15175662BFC91C55</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T17:35+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Physical commonsense reasoning</term>
					<term>large language models</term>
					<term>Italian benchmark</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>In the context of the CALAMITA Challenge, we investigate the physical commonsense reasoning capabilities of large language models (LLMs) and introduce a methodology to assess their understanding of the physical world. To this end, we use a test set designed to evaluate physical commonsense reasoning in LLMs for the Italian language. We present a tiered dataset, named the Graded Italian Annotated dataset (GITA), which is written and annotated by a professional linguist. This dataset enables us to focus on three distinct levels of commonsense understanding. Our benchmark aims to evaluate three specific tasks: identifying plausible and implausible stories within our dataset, identifying the conflict that generates an implausible story, and identifying the physical states that make a story implausible. We perform these tasks using LLAMA3, Gemma2 and Mistral. Our findings reveal that, although the models may excel at high-level classification tasks, their reasoning is inconsistent and unverifiable, as they fail to capture intermediate evidence.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Challenge: Introduction and Motivation</head><p>Physical commonsense understanding refers to the ability to comprehend the physical world and the events that transpire within it. This capability is a crucial component of human intelligence, enabling us to reason about our environment, anticipate future occurrences, and navigate our surroundings effortlessly, and recently there has been notable advancement in the development of large language models (LLMs) that can produce human-like language and execute a variety of language-related tasks.</p><p>LLMs have exhibited promising outcomes in grasping common sense in particular situations <ref type="bibr" target="#b0">[1,</ref><ref type="bibr" target="#b1">2]</ref>. Nevertheless, it is widely recognized that the most precise evaluation of their capabilities is attained when assessing their performance in specific end tasks <ref type="bibr" target="#b2">[3,</ref><ref type="bibr" target="#b3">4]</ref>. The evaluation often emphasizes the capacity of LLMs to replicate relatively straightforward tasks, rather than their authentic proficiency in reasoning and comprehending language <ref type="bibr" target="#b5">[5,</ref><ref type="bibr" target="#b6">6]</ref>. As a result, there remains uncertainty regarding machines' ability to truly perform reasoning and whether the existing issues in this regard have been sufficiently addressed.</p><p>In this context, our aim is to contribute to this challenge developing an original Italian benchmark that can be used to assess the ability of language models to understand physical commonsense in a more truthful way, focusing not only on end tasks, but also on intermediate layer tasks.</p><p>In this paper, we present GITA4CALAMITA, the Graded Italian Annotated dataset for the CALAMITA challenge <ref type="bibr" target="#b7">[7]</ref>. 
GITA4CALAMITA is an adapted version of the GITA dataset proposed in <ref type="bibr" target="#b8">[8]</ref>. In particular, we decided to revise the physical states annotation and adapt it to this challenge. The first version of the GITA dataset is available in our repository under the license CC BY-NC-SA 4.0. The GITA4CALAMITA dataset is manually compiled by a professional linguist, which allows for this multi-layered evaluation of the reasoning process. With the creation of an Italian dataset we gain the linguistic and cultural perspective of Italian, while commonsense research in Natural Language Processing (NLP) has largely been focused on the English language.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Challenge: Description</head><p>Our aim in this challenge is to assess the understanding of physical commonsense in LLMs for Italian. We configure our assessment proposal in the following terms:</p><p>1. given an original dataset of plausible/implausible stories related to physical commonsense, systems must identify the plausible and implausible stories; 2. systems must recognize the conflicting sentences that generate the conflict in implausible stories; 3. systems must spot the underlying physical states that cause conflict in implausible stories.</p><p>The recognition of plausible/implausible stories is the end task envisaged in this benchmark, which must be justified by the second-level and third-level steps. In Figure <ref type="figure" target="#fig_0">1</ref> we present a story pair from the GITA4CALAMITA dataset and the relation between the layers of annotation. Story A is a plausible story, Story B is the corresponding implausible story where the first and the second sentences are in conflict: Marco closes the refrigerator and cannot take the milk out of it. In the right part of the figure we can see the reasoning steps that the system must follow and resolve. This example is presented in English for clarity, but our entire dataset is in Italian.</p><p>We introduce a series of tasks that constitute a humaninterpretable reasoning process, supported by a chain of evidence, reflecting the assessment methodology outlined above. To explain this approach, we present the tasks from the deepest to the shallowest, mirroring human reasoning:</p><p>Physical state classification: Leveraging our physical state annotations, systems must recognize the involved physical states in the conflicting sentences of implausible stories. 
If we look at the example in 1, we are able to identify the problematic physical state "open" as cause of implausibility.</p><p>Conflict detection: Next, the task of conflict detection entails identifying sentence pairs of the form Si → Sj. Here, Sj represents the breakpoint, indicating the point at which the story becomes implausible based on the given context. Si serves as the evidence that explains the breakpoint, typically causing a conflicting world state.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Story classification:</head><p>The end task revolves around determining the plausibility of two stories. This determination is based on the conflicts detected within the two stories. By considering the presence of conflicts, the model can assess the viability and coherence of each story, facilitating the classification of the more plausible one.</p><p>By incorporating physical state classification, conflict detection, and story classification, we analyze the aspects of coherent reasoning, supported by evidence-driven analysis.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Data description</head><p>The GITA4CALAMITA dataset is composed by plausible and implausible stories. To compose the dataset, we focused on concrete actions that could be visualized in the physical world, avoiding mental actions such as "to think" or "to like". We created 5-sentence stories, giving context and requiring reasoning over multiple sentences. In all the stories, we avoided nonsensical sentences, in fact, each sentence is plausible alone, but could be implausible if associated with another specific sentence in an implausible story. With these characteristics, the task requires reasoning over the entire context.</p><p>An essential part of our evaluation process is constituted by the presence of physical state annotation. Systems must identify the underlying physical states that make a story not plausible in our physical world. During the creation of this dataset, we took into account 14 physical attributes that were included in the annotation phase, and we composed stories that contained those attributes. Following the work of <ref type="bibr" target="#b9">[9]</ref> and <ref type="bibr" target="#b10">[10]</ref>, these are the 14 physical states that we wanted to have in our stories:</p><p>• location, conscious, dressed, wet, exist, clean, power, functional, in pieces, open, temperature, solid, occupied, edible.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Dataset creation</head><p>In the first two rows of Table <ref type="table">1</ref> we can see an example of a plausible story from the GITA4CALAMITA dataset</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 1</head><p>Example of a plausible story, an implausible story from the Order dataset, and an implausible story from the Cloze dataset.</p><p>together with the English translation. In this example, the human actor is Marco, and the five sentences are ordered in the required way: the action of opening something, picking something up and using it. We can see that some of the previously listed physical states appear: Marco is conscious because he is doing something, the refrigerator is open because the actor can take something out of it, the cup is not occupied by anything and can be functional.</p><p>We aimed to minimize subjectivity and limit potential confounding factors from complex language usage. By using simple language, we were able to shift our focus away from linguistic processing and semantic phenomena, allowing us to concentrate more on examining machines' reasoning abilities, particularly their physical commonsense understanding. Consequently, we created our simple sentences in a straightforward declarative structure, typically starting with the agent of the story, followed by a verb, a direct object, and optionally, an indirect object.</p><p>Implausible stories are built upon the plausible ones, preserving the same actor and objects; in doing so we ensured that implausible variations remained coherent and believable, and we avoided nonsensical information. To create implausible stories, we implemented two different methods:</p><p>1. we switched the order of two sentences; 2. we substituted a plausible sentence with an implausible one.</p><p>These two methods resulted in two different partitions of our dataset: the Order dataset of implausible stories, and the Cloze dataset of implausible stories respectively.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.1.">Order implausible stories</head><p>The plausible stories only work in the causal sequence that we created. In the first row of Table 1, there is an example of a plausible story. In the third row, we see the corresponding implausible story for the order dataset, in which Marco, first, takes the milk out from the refrigerator and then opens the refrigerator, generating a physically impossible situation: it is not possible to take something out of a closed refrigerator. By switching the first and the second sentences, we created an implausible story. In the entire dataset, we decided to generate implausible stories changing the order of only two sentences per story.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.2.">Cloze implausible stories</head><p>The second approach involves the substitution of a sentence from the plausible story with a new sentence. Although the new sentence itself is not inherently implausible, its placement within the sequence renders it implausible. In Table <ref type="table">1</ref>, the first sentence of the line F (Cloze), in the fifth row, was changed: Marco closes the refrigerator before taking out the milk. Again, the action is physically impossible: if the refrigerator is closed, nothing can be taken out from it.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Origin of data</head><p>GITA4CALAMITA is a new version of <ref type="bibr" target="#b8">[8]</ref>, which is based on <ref type="bibr" target="#b11">[11]</ref>. Our main objective was to create an Italian dataset, manually annotated, to assess a pre-trained language model on physical commonsense tiered tasks. To create the stories, we took inspiration from the Story Cloze Test <ref type="bibr" target="#b12">[12]</ref> and ROCStories Corpora <ref type="bibr" target="#b13">[13]</ref>. The Story Cloze Test compiles four-sentence stories with a missing ending so that a system chooses the most appropriate conclusion; the ROCStories Corpora is composed of five-sentence stories about everyday life for story generation.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3.">Annotation details</head><p>GITA4CALAMITA is annotated on three levels. In the first level, we annotated the plausibility/implausibility of a story with TRUE or FALSE. In the second level, in implausible stories we indicated between which sentences the conflict was, and in the third level we labelled the involved physical states in each sentence.</p><p>In the dataset, a plausible story is identified using a story number, while implausible stories are identified using the same story number as the plausible version, but with an additional C or O after the story number, where the letter C refers to the Cloze dataset, and the letter O refers to the Order dataset. Each story has been annotated using these elements: story id, worker id, actor of the story, objects of the story, physical states, sentences of the story, as well as number of sentences, and conflicting sentences, among others. The complete list and the specific meaning of each element are in Appendix A.</p><p>In each implausible story, we annotated the physical state that caused a conflict between two sentences. We annotated both Order and Cloze implausible stories according to the corresponding physical state involved. If we consider the stories in Table <ref type="table">1</ref>, both implausible stories (C and O) are annotated using the physical state "open", In fact, in both implausible stories the conflict is related to the openness of the refrigerator: in both cases the refrigerator appears closed when Marco tries to take the milk out of it. There are cases where for one plausible story there are two implausible stories that are implausible for two different reasons, hence the annotated physical state is different.</p><p>To ensure consistency and reduce human effort, we developed a custom environment and a Python script to streamline the annotation process. 
This semi-automated annotation process helped us process sentences from different story types, extract entities and actors, and organize them for manual annotation. The script provided a user-friendly terminal interface, and it is available in our repository. In terms of annotation efficiency, manually annotating one plausible story and two implausible ones typically took around 50 minutes. However, using our semi-automated annotation interface, we were able to complete the same task in approximately 20 minutes. Consequently, instead of the estimated 100 hours for annotating the entire dataset, we reduced the time to around 40 hours. Additionally, some annotations required review and occasional revisions, hence we estimated that the overall effort was of approximately 50-55 hours. An example of a complete annotation can be found in Appendix B.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.4.">Data format</head><p>The GITA4CALAMITA dataset was created and annotated in a JSON format. The following example is story 0-C0 of our dataset, the first implausible Cloze story.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>{</head><p>"0-C0": { "story_id": 0, "worker_id": "GAP", "type": "cloze", "idx": 0, "aug": false, "actor": "Marco", "location": "cucina", "objects": "frigo, latte, tazza, cucchiaio", "sentences": [ "Marco ha chiuso il frigo.", "Marco ha preso il latte dal frigo.", "Marco ha preso la tazza.", "Marco ha preso il cucchiaio.", "Marco ha messo il cucchiaio nella tazza." ], "length": 5, "example_id": "0-C0", "plausible": false, "breakpoint": 1, "confl_sents": [0], "confl_pairs": [0, 1] } }</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.5.">Example of prompts used for zero or/and few shots</head><p>For each of the three proposed tasks we use a different prompt:</p><p>• Task 1: Please read the following story and answer if the story is plausible taking into account the order of the events. Please answer with true or false.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Task 2:</head><p>The following story is implausible. Identify the breakpoint, and then select the sentence responsible for the implausibility. Please identify the breakpoint sentence and the conflicting sentence.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Task 3:</head><p>The following story is implausible. Identify the physical state that causes the conflict in the story. These are the descriptions of each physical state: Power: Indicates whether an object is powered or not, relevant for electrical devices.</p><p>Location: Refers to the spatial position of an entity, either human or object. Exist: Denotes whether an object is present or has disappeared. Clean: Refers to the cleanliness of an entity, indicating whether it is clean or dirty. Edible: Identifies whether an object is fit for consumption. Wet: Denotes whether an object or person is in a wet or dry state. Functional: Refers to whether an object is in working condition or broken. Wearing: Applies to humans, indicating whether they are dressed or not. Open: Refers to whether an object (e.g., a door or container) is open or closed. Conscious: Denotes whether a human is conscious or unconscious. Temperature: Refers to the relative temperature of an entity, e.g., hot or cold. Solid: Describes whether an object is in a solid state. Occupied: Indicates whether an object (e.g., a container) is occupied or contains something. In pieces: Refers to whether an object is intact or has been broken into pieces. Select one of them after reading the story.</p><p>We select some examples from our GITA4CALAMITA dataset to be used as few-shot examples. For some of the tests we randomly select the examples, for others, we base our choice on their variability. We select stories where all possible combination of conflicting sentences were happening; at the same time, within the selected stories we try to include most of the physical states annotated.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.6.">Detailed data statistics</head><p>The GITA4CALAMITA dataset is an Italian test set composed of a total of 356 stories. The statistics of the GITA4CALAMITA dataset are in Table <ref type="table" target="#tab_0">2</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Measures</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Metrics</head><p>The metrics involved in our tasks for the GITA4CALAMITA benchmark are the following ones:</p><p>• Accuracy assesses the traditional measure of end task accuracy, which quantifies the proportion of testing examples where plausible stories and implausible stories are accurately identified. • Consistency measures the proportion of testing examples where not only the implausible story is correctly identified, but also the conflicting sentence pair for the implausible story is accurately identified. The aim is to demonstrate the model's consistency in recognizing conflicts when reasoning about plausibility. • Verifiability evaluates the proportion of testing examples where not only the implausible story and the conflicting sentence pair for the implausible story are correctly identified, but also the underlying physical states that contribute to the conflict are accurately identified. This demonstrates that the detected conflict can be validated through a correct understanding of the underlying implausible change of physical states.</p><p>Taking into consideration the three different metrics, in Table <ref type="table" target="#tab_2">3</ref> we report the results in our test set. We perform experiments using the base and instruct Llama 3.1, Gemma 2 and Mistral models of various sizes. Each metric is obtained from a different task, where models are evaluated in the instances that are only guessed correctly in the previous tasks. All tasks are evaluated in a 3-shot setting, using random examples from the test set. For models that support system prompt (Llama3.1 models), the description of each task is included there, for models that do not support it (Gemma2 and Mistral models) the task description is included in the first user input. Each few-shot instance is formatted as a multiturn conversation between user and assistant. 
Next, we describe the main findings from these results.</p><p>Model Size and Performance: Generally, larger models (e.g., Llama-3.1 70B) outperform smaller models across the metrics. The 70B Llama-3.1 models show improvements over their 8B counterparts, particularly in consistency and verifiability. Gemma2 models also show improvements when bigger models are used. There are two exceptions in the case of the accuracy: Gemma2-Instruct 9B and Llama-3.1-Instruct 8B achieve better results than their bigger counterparts Gemma2 27B and Llama3 70B. They also outperform the base models.  </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Limitations</head><p>This study has some limitations that should be acknowledged. Firstly, only one prompt was tested for each task, which may not fully capture the potential variability in performance. Additionally, the models used were multilingual but not specifically tailored for the Italian language, potentially affecting the accuracy of the results for Italian-specific tasks. Furthermore, the dataset used in this study was limited to stories within the household domain, which may not generalize well to other contexts.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">Ethical issues</head><p>The dataset contains stories that may prototypically occur in Italian households. While most of these narratives are likely to be familiar to a broad audience, people from different cultural backgrounds may find some of the stories less frequent.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: Representation of story pair from GITA</figDesc><graphic coords="2,89.29,85.19,432.56,71.02" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 2</head><label>2</label><figDesc>Statistics of GITA4CALAMITA</figDesc><table><row><cell></cell><cell>GITA4CALAMITA</cell></row><row><cell>plausible stories</cell><cell>117</cell></row><row><cell>implausible stories (ORDER)</cell><cell>122</cell></row><row><cell>implausible stories (CLOZE)</cell><cell>117</cell></row><row><cell>total stories</cell><cell>356</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head>Table 3</head><label>3</label><figDesc>Results of the base and instruct Llama 3.1, Gemma 2 and Mistral models of various sizes Instruction Tuning Effects: Instruction-tuned versions (e.g., Gemma-2-Instruct, Llama-3.1-Instruct) typically outperform their base counterparts. There are exceptions such as order accuracy for Llama 3.1 70B and Gemma 2 9B. However, Mistral-V0.3-Instruct is very similar or worse than the base model and generally is more biased, it tends to classify as plausible the stories and it performs better in Cloze than in Order. Cloze, Order and Plausible: Most models perform generally better on Cloze examples compared to Order examples. This is consistent across models and metrics. Models are generally better in Cloze and Order than in Plausible. This could be explained by the bias of the models to answer true or false when they are asked if the story is plausible. Models also see double implausible few-shot examples, which could also cause models to give that answer more frequently.</figDesc><table /></figure>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>This work has been partially funded by: • DeepR3 (TED2021-130295B-C31) funded by MCIN/AEI/10.13039/501100011033 and European Union NextGeneration EU/PRTR. • Disargue (TED2021-130810B-C21) MCIN/AEI/10.13039/501100011033 and European Union NextGenerationEU/PRTR. • DeepKnowledge (PID2021-127777OB-C21) MCIN/AEI/10.13039/501100011033 and by FEDER, EU. • Ixa group A type research group (IT1570-22) Basque Government • IKER-GAITU project 11:4711:23:410:23/0808 by Basque Government</p></div>
			</div>

			<div type="annex">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>A. Annotations in the dataset</head><p>These are the attributes that encode the metadata and linguistic information in the GITA dataset:</p><p>• story_id: refers to the number of the story for both plausible and implausible stories. • worker_id: refers to the name assigned to a specific worker during the creation of the story. • type: refers to cloze or order and it is a label used only in implausible stories. • idx: refers to the implausible dataset, where there is more than one implausible story for a given story number; for example, if we have more than one implausible version of a plausible story (we created more than an implausible story changing the order of our sentences more than once), the index number indicates to which implausible example we are referring. • aug: refers to possible automatic data augmentation techniques that can be taken into account for future works to resolve an overfitting problem. • actor: refers to the human agent of the story.</p><p>• location: refers to the room where the story takes place. • objects: refers to all the inanimate entities that we find into each story. • sentences: includes the 5 sentences in the story.</p><p>• length: refers to the number of sentences in each story. • example_id: corresponds to the story number and includes letters for implausible stories.</p><p>• plausible: is TRUE when the story is plausible and FALSE when it is implausible. • breakpoint: refers to the sentence where the story becomes implausible, where the conflict becomes evident; in plausible stories the breakpoint is always -1. • conlict_sents: refers to the other sentence in the story that together with the breakpoint sentence makes the story implausible; in plausible stories this field is blank. • conlict_pairs: refers to the conflict pair of sentences, gathering the two previous labels; in plausible stories this field is blank. 
• states: includes all the physical states annotations for all the stories. Marco ha preso il latte.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>B. Annotation environment</head><p>Marco ha preso la tazza.</p><p>Marco ha preso il cucchiaio.</p><p>Marco ha messo il cucchiaio nella tazza. length: </p></div>			</div>
			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Towards Reasoning in Large Language Models: A Survey</title>
		<author>
			<persName><forename type="first">J</forename><surname>Huang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">C.-C</forename><surname>Chang</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2023.findings-acl.67</idno>
		<ptr target="https://aclanthology.org/2023.findings-acl.67" />
	</analytic>
	<monogr>
		<title level="m">Findings of the Association for Computational Linguistics: ACL 2023, Association for Computational Linguistics</title>
				<meeting><address><addrLine>Toronto, Canada</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="1049" to="1065" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">WinoGrande: An Adversarial Winograd Schema Challenge at Scale</title>
		<author>
			<persName><forename type="first">K</forename><surname>Sakaguchi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">L</forename><surname>Bras</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Bhagavatula</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Choi</surname></persName>
		</author>
		<idno type="DOI">10.1145/3474381</idno>
		<ptr target="https://doi.org/10.1145/3474381" />
	</analytic>
	<monogr>
		<title level="j">Commun. ACM</title>
		<imprint>
			<biblScope unit="volume">64</biblScope>
			<biblScope unit="page" from="99" to="106" />
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">A Review on Fairness in Machine Learning</title>
		<author>
			<persName><forename type="first">D</forename><surname>Pessach</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Shmueli</surname></persName>
		</author>
		<idno type="DOI">10.1145/3494672</idno>
		<ptr target="https://doi.org/10.1145/3494672" />
	</analytic>
	<monogr>
		<title level="j">ACM Comput. Surv</title>
		<imprint>
			<biblScope unit="volume">55</biblScope>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Benchmarks for Automated Commonsense Reasoning: A Survey</title>
		<author>
			<persName><forename type="first">E</forename><surname>Davis</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">ACM Comput. Surv</title>
		<imprint/>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<monogr>
		<title/>
		<idno type="DOI">10.1145/3615355</idno>
		<ptr target="https://doi.org/10.1145/3615355" />
		<imprint/>
	</monogr>
	<note>just Accepted</note>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">How Can We Accelerate Progress Towards Human-like Linguistic Generalization?</title>
		<author>
			<persName><forename type="first">T</forename><surname>Linzen</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2020.acl-main.465</idno>
		<ptr target="https://aclanthology.org/2020.acl-main.465" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics</title>
				<meeting>the 58th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics</meeting>
		<imprint>
			<date type="published" when="2020">2020</date>
			<biblScope unit="page" from="5210" to="5217" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">Climbing towards NLU: On Meaning, Form, and Understanding in the Age of Data</title>
		<author>
			<persName><forename type="first">E</forename><forename type="middle">M</forename><surname>Bender</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Koller</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2020.acl-main.463</idno>
		<ptr target="https://aclanthology.org/2020.acl-main.463" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics</title>
				<meeting>the 58th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics</meeting>
		<imprint>
			<date type="published" when="2020">2020</date>
			<biblScope unit="page" from="5185" to="5198" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">CALAMITA: Challenge the Abilities of LAnguage Models in ITAlian</title>
		<author>
			<persName><forename type="first">G</forename><surname>Attanasio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Basile</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Borazio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Croce</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Francis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Gili</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Musacchio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Nissim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Patti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Rinaldi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Scalena</surname></persName>
		</author>
		<ptr target="https://ceur-ws.org/" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)<address><addrLine>Pisa, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-12-06">December 4 -December 6, 2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">A Multilayered Approach to Physical Commonsense Understanding: Creation and Evaluation of an Italian Dataset</title>
		<author>
			<persName><forename type="first">G</forename><surname>Pensa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Altuna</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Gonzalez-Dios</surname></persName>
		</author>
		<ptr target="https://aclanthology.org/2024.lrec-main.74" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
				<editor>
			<persName><forename type="first">N</forename><surname>Calzolari</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">M.-Y</forename><surname>Kan</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">V</forename><surname>Hoste</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">A</forename><surname>Lenci</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">S</forename><surname>Sakti</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">N</forename><surname>Xue</surname></persName>
		</editor>
		<meeting>the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)<address><addrLine>Torino, Italia</addrLine></address></meeting>
		<imprint>
			<publisher>ELRA and ICCL</publisher>
			<date type="published" when="2024">2024</date>
			<biblScope unit="page" from="819" to="831" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">Physical Causality of Action Verbs in Grounded Language Understanding</title>
		<author>
			<persName><forename type="first">Q</forename><surname>Gao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Doering</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Chai</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/P16-1171</idno>
		<ptr target="https://aclanthology.org/P16-1171" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics</title>
		<title level="s">Long Papers</title>
		<meeting>the 54th Annual Meeting of the Association for Computational Linguistics<address><addrLine>Berlin, Germany</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2016">2016</date>
			<biblScope unit="volume">1</biblScope>
			<biblScope unit="page" from="1814" to="1824" />
		</imprint>
	</monogr>
	<note>Association for Computational Linguistics</note>
</biblStruct>

<biblStruct xml:id="b10">
	<monogr>
		<title level="m" type="main">Simulating Action Dynamics with Neural Process Networks</title>
		<author>
			<persName><forename type="first">A</forename><surname>Bosselut</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Levy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Holtzman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Ennis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Fox</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Choi</surname></persName>
		</author>
		<idno>CoRR abs/1711.05313</idno>
		<ptr target="http://arxiv.org/abs/1711.05313" />
		<imprint>
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Tiered Reasoning for Intuitive Physics: Toward Verifiable Commonsense Language Understanding</title>
		<author>
			<persName><forename type="first">S</forename><surname>Storks</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Gao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Chai</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2021.findings-emnlp.422</idno>
		<ptr target="https://aclanthology.org/2021.findings-emnlp.422" />
	</analytic>
	<monogr>
		<title level="m">Findings of the Association for Computational Linguistics: EMNLP 2021, Association for Computational Linguistics</title>
				<meeting><address><addrLine>Punta Cana, Dominican Republic</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="4902" to="4918" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">LSDSem 2017 Shared Task: The Story Cloze Test</title>
		<author>
			<persName><forename type="first">N</forename><surname>Mostafazadeh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Roth</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Louis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Chambers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Allen</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/W17-0906</idno>
		<ptr target="https://aclanthology.org/W17-0906" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2nd Workshop on Linking Models of Lexical, Sentential and Discourse-level Semantics, Association for Computational Linguistics</title>
				<meeting>the 2nd Workshop on Linking Models of Lexical, Sentential and Discourse-level Semantics, Association for Computational Linguistics<address><addrLine>Valencia, Spain</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2017">2017</date>
			<biblScope unit="page" from="46" to="51" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">A Corpus and Cloze Evaluation for Deeper Understanding of Commonsense Stories</title>
		<author>
			<persName><forename type="first">N</forename><surname>Mostafazadeh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Chambers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>He</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Parikh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Batra</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Vanderwende</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Kohli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Allen</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/N16-1098</idno>
		<ptr target="https://aclanthology.org/N16-1098" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Association for Computational Linguistics</title>
				<meeting>the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Association for Computational Linguistics<address><addrLine>San Diego, California</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2016">2016</date>
			<biblScope unit="page" from="839" to="849" />
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
