<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta />
    <article-meta>
      <title-group>
        <article-title>Overview of Touché 2024: Argumentation Systems</article-title>
      </title-group>
      <contrib-group>
        <!-- "Extended Version" is part of the article title/subtitle, misparsed as an author name during extraction; it is not a contributor. -->
        <contrib contrib-type="author">
          <string-name>Johannes Kiesel</string-name>
          <email>johannes.kiesel@uni-weimar.de</email>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Çağrı Çöltekin</string-name>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Maximilian Heinrich</string-name>
          <email>maximilian.heinrich@uni-weimar.de</email>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Maik Fröbe</string-name>
          <email>maik.froebe@uni-jena.de</email>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Milad Alshomary</string-name>
          <email>m.alshomary@ai.uni-hannover.de</email>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Bertrand De Longueville</string-name>
          <email>bertrand.de-longueville@ec.europa.eu</email>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Tomaž Erjavec</string-name>
          <email>tomaz.erjavec@ijs.si</email>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Nicolas Handke</string-name>
          <email>nicolas@bioinf.uni-leipzig.de</email>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Matyáš Kopp</string-name>
          <email>kopp@ufal.mff.cuni.cz</email>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Nikola Ljubešić</string-name>
          <email>nikola.ljubesic@ijs.si</email>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Katja Meden</string-name>
          <email>katja.meden@ijs.si</email>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Nailia Mirzhakhmedova</string-name>
          <email>nailia.mirzakhmedova@uni-weimar.de</email>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Vaidas Morkevičius</string-name>
          <email>vaidas.morkevicius@ktu.lt</email>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Theresa Reitis-Münstermann</string-name>
          <email>theresa.reitis-munstermann@ext.ec.europa.eu</email>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Mario Scharfbillig</string-name>
          <email>mario.scharfbillig@ec.europa.eu</email>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Nicolas Stefanovitch</string-name>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Henning Wachsmuth</string-name>
          <email>h.wachsmuth@ai.uni-hannover.de</email>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Martin Potthast</string-name>
          <email>martin.potthast@uni-kassel.de</email>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Benno Stein</string-name>
          <email>benno.stein@uni-weimar.de</email>
        </contrib>
        <aff id="aff0">
          <label>0</label>
          <institution>Arcadia Sistemi Informativi Territoriali</institution>
        </aff>
        <aff id="aff1">
          <label>1</label>
          <institution>Charles University</institution>
        </aff>
        <aff id="aff2">
          <label>2</label>
          <institution>European Commission, Joint Research Centre</institution>
          ,
          <addr-line>JRC</addr-line>
        </aff>
        <aff id="aff3">
          <label>3</label>
          <institution>Jožef Stefan Institute</institution>
        </aff>
        <aff id="aff4">
          <label>4</label>
          <institution>Kaunas University of Technology</institution>
        </aff>
        <aff id="aff5">
          <label>5</label>
          <institution>Leibniz University Hannover</institution>
        </aff>
        <aff id="aff6">
          <label>6</label>
          <institution>Leipzig University</institution>
        </aff>
        <aff id="aff7">
          <label>7</label>
          <institution>University of Kassel</institution>
          ,
          <addr-line>hessian.AI, and ScaDS.AI</addr-line>
        </aff>
        <aff id="aff8">
          <label>8</label>
          <institution>University of Tübingen</institution>
        </aff>
      </contrib-group>
      <abstract>
        <p>This paper is the extended overview of Touché: the fifth edition of the lab on argumentation systems that was held at CLEF 2024. With the goal to foster the development of support-technologies for decision-making and opinion-forming, we organized three shared tasks: (1) Human value detection (ValueEval), where participants detect (implicit) references to human values and their attainment in text; (2) Multilingual Ideology and Power Identification in Parliamentary Debates, where participants identify from a speech the political leaning of the speaker's party and whether it was governing at the time of the speech (new task); and (3) Image retrieval or generation in order to convey the premise of an argument visually. In this paper, we describe these tasks, their setup, and participating approaches in detail.</p>
      </abstract>
    </article-meta>
  </front>
  <body>
    <sec id="sec-1">
      <title>1. Introduction</title>
      <p>
        Decision-making and opinion-forming are everyday tasks, for which everybody has the chance to
acquire knowledge on the Web on almost every topic. However, conventional search engines are
primarily optimized for returning relevant results, which is insufficient for collecting and weighing the
pros and cons for a topic. To close this gap of technologies that support people in decision-making
and opinion-forming, the Touché lab’s shared tasks1 (https://touche.webis.de) call for the research
community to develop respective approaches. In 2024, we organized the three following shared tasks:
1. Human Value Detection (a continuation of ValueEval’23 @ SemEval [
        <xref ref-type="bibr" rid="ref2">2</xref>
        ]) features two subtasks in
ethical argumentation of detecting human values in texts and their attainment, respectively.
2. Ideology and Power Identification in Parliamentary Debates features two subtasks in debate
analysis of detecting the ideology and position of power of the speaker’s party, respectively (new
task).
3. Image Retrieval/Generation for Arguments (third edition, now joint task with ImageCLEF) is
about the retrieval or generation of images to help convey an argument’s premise.
      </p>
      <p>In total, 20 teams participated in Touché in 2024. Nine teams participated in the human value
detection task (cf. Section 4)—of which six submitted a notebook paper—and submitted 21 runs. Most
teams integrated DeBERTa [3], RoBERTa [4], or the multi-lingual XLM-RoBERTa [5]. Only one team
employed a generative approach (employing GPT-4o). Nine teams participated in the multilingual
ideology and power identification task (cf. Section 5) and submitted 52 runs. The majority of teams
participated in both subtasks. While traditional machine learning methods like support vector classifiers
or logistic regression with n-gram features were more common among participating teams,
higher scores were typically obtained by teams using pretrained models. Two teams participated in the image
retrieval/generation for arguments task (cf. Section 6) and submitted 8 runs. Both teams used similarity
embeddings between images and text. One team used CLIP [6], the other a DPR [7] inspired approach.
The corpora, topics, and judgments created at Touché are freely available to the research community
on the lab’s website.2</p>
    </sec>
    <sec id="sec-2">
      <title>2. Related Work</title>
      <p>Argumentation systems are diverse and are connected to many fields within and outside of computer
science. The following sections review the related work for each Touché task of 2024.</p>
      <sec id="sec-2-1">
        <title>2.1. Human Value Detection</title>
        <p>Due to their outlined importance, human values have been studied both in the social sciences [8] and
in formal argumentation [9] for decades. According to the former, a “value is a (1) belief (2) pertaining
to desirable end states or modes of conduct, that (3) transcends specific situations, (4) guides selection
or evaluation of behavior, people, and events, and (5) is ordered by importance relative to other values
to form a system of value priorities.” For cross-cultural analysis, Schwartz derived 48 value questions
from universal individual and societal needs, including concepts such as obeying all the laws and being
humble [10]. Based on these taxonomies are several studies in the social sciences, which could greatly
benefit from the automated methods our task aims at [ 11]. See Scharfbillig et al. [12] for a recent
overview and practical insights from the social sciences.</p>
        <p>
          Moreover, several works in computer science utilize values. For example, in the context of
interactive systems, to tune interactive chat-based agents or texts in general towards morally acceptable
behavior [13, 14]. A related dataset is ValueNet [15], which contains 21K one-sentence descriptions
of social scenarios (taken from SOCIAL-CHEM-101 [16]) annotated for the 10 value categories of an
earlier version of Schwartz’ value taxonomy. A major difference to the Touché24-ValueEval dataset
are the more ordinary situations in ValueNet (e.g., whether to say “I miss mom”). Our earlier work
analyzed values in short arguments [
          <xref ref-type="bibr" rid="ref2">17, 2</xref>
          ].
1‘touché’ confirms “a hit in fencing or the success or appropriateness of an argument, an accusation, or a witty point.”
[https://merriam-webster.com/dictionary/touche]
2https://touche.webis.de/
        </p>
      </sec>
      <sec id="sec-2-2">
        <title>2.2. Ideology and Power Identification</title>
        <p>Parliamentary data has a high societal impact and provides publicly available sources for analyzing
(argumentative) language. Thus the number of resources based on parliamentary proceedings [18, 19],
and computational and linguistics analyses of parliamentary debates [20, 21] increased in recent years.</p>
        <p>The present task is about two important aspects of the political discourse, ideology and power.
Although a simplification, political orientation on the left-to-right spectrum has been one of the defining
properties of political ideology [22, 23]. Power is another factor that shapes the political discourse
[24, 25, 26]. Automatic identification of political orientation from texts has attracted considerable
interest [27, 28, 29, 30, 31], including a few recent shared tasks [32, 33]. The present task differs from the
earlier ones, with respect to the source material (parliamentary debates, rather than the popular sources
of social media or news) and multilinguality. Despite its central role in critical discourse analysis, to the
best of our knowledge, power in parliamentary debates has not been studied computationally. There
have been only a few recent computational studies providing indications of linguistic differences between
governing and opposition parties [34, 35, 36, 37]. The present shared task and associated data is likely
to provide a reference for the future studies investigating power in political discourse.</p>
      </sec>
      <sec id="sec-2-3">
        <title>2.3. Image Retrieval/Generation for Arguments</title>
        <p>
          Images are a powerful tool for visual communication. They can provide contextual information and
express, underline, or popularize an opinion [38], thereby taking the form of subjective statements [39].
Some images express both a premise and a conclusion, making them full arguments [
          <xref ref-type="bibr" rid="ref3">40, 41</xref>
          ]. Other
images may provide contextual information only and have to be combined with a textual conclusion to
form a complete argument. In this regard, a recent SemEval task distinguished a total of 22 persuasion
techniques in memes alone [
          <xref ref-type="bibr" rid="ref4">42</xref>
          ]. Moreover, argument quality dimensions like acceptability, credibility,
emotional appeal, and sufficiency [
          <xref ref-type="bibr" rid="ref5">43</xref>
          ] all apply to arguments that include images as well.
        </p>
      </sec>
    </sec>
    <sec id="sec-3">
      <title>3. Lab Overview and Statistics</title>
      <p>For the fifth edition of the Touché lab, we received 68 registrations from 22 countries (vs. 41 registrations
in 2023). The most lab registrations came from India (24). Out of the 68 registered teams, 20 actively
participated in this year’s Touché edition (9, 9, and 2 teams submitting valid runs for Task 1, 2, and 3,
respectively). Active teams in previous editions were: 7 in 2023, 23 in 2022, 27 in 2021, and 17 in 2020.</p>
      <p>
        We used TIRA [
        <xref ref-type="bibr" rid="ref6">44</xref>
        ] as the submission platform for Touché 2024 through which participants could
either submit code, software, or run files. 3 Code and software submissions increase reproducibility, as
the software can later be executed on different data of the same format. To submit software, a team
implemented their approach in a Docker image that they then uploaded to their dedicated Docker
registry in TIRA. Software submissions in TIRA are immutable, and after the docker image had been
submitted, the teams specified the to-be-executed command—the same Docker image can thus be
used for multiple software submissions (e.g., by changing some parameters). A team could upload as
many Docker images or software submissions as they liked; only they and TIRA had access to their
dedicated Docker image registry (i.e., the images were not public while the shared task was ongoing).
To improve reproducibility, TIRA executes software in a sandbox by removing the internet connection
(ensuring that the software is fully installed in the Docker image which eases rerunning software later,
as libraries and models must be installed in an image). For the execution, participants could select
the resources that their software had available for execution, from 1 CPU core with 10 GB RAM up
to 5 CPU cores with 50 GB RAM and 1 Nvidia A100 GPU with 40 GB RAM. Participants could run
their software multiple times using different resources to study the scalability and reproducibility (e.g.,
whether the software executed on a GPU yields the same results as on a CPU). TIRA used a Kubernetes
cluster with 1,620 CPU cores, 25.4 TB RAM, 24 GeForce GTX 1080 GPUs, and 4 A100 GPUs to schedule
and execute the software submissions, to allocate the resources that the participants selected.
      </p>
      <sec id="sec-3-1">
        <title>Inner circle: 19 human values</title>
        <p>(see https://valueeval.webis.de)</p>
      </sec>
      <sec id="sec-3-2">
        <title>Outer circle: four motivational directions (not used in this task)</title>
        <p>• Openness to change</p>
      </sec>
      <sec id="sec-3-3">
        <title>Being independent and exploring</title>
        <p>• Self-enhancement</p>
      </sec>
      <sec id="sec-3-4">
        <title>Seeking pleasure, wealth, and esteem</title>
        <p>• Conservation</p>
      </sec>
      <sec id="sec-3-5">
        <title>Preserving group cohesion, order, and security</title>
        <p>• Self-transcendence</p>
      </sec>
      <sec id="sec-3-6">
        <title>Helping others, close ones, and nature</title>
      </sec>
    </sec>
    <sec id="sec-4">
      <title>4. Task 1: Human Value Detection (ValueEval’24)</title>
      <p>The goal of this task is to develop approaches that allow for the large-scale analysis of human values
behind texts. In argumentation, one has to consider that people have different beliefs and priorities of
what is generally worth striving for (e.g., personal achievements vs. humility) and how to do so (e.g.,
being self-directed vs. respecting traditions), referred to as (human) values. By analyzing corpora of
texts, for example for news portals or political parties, one can develop an understanding of the values
that the authors deem the most important.</p>
      <sec id="sec-4-1">
        <title>4.1. Task Definition</title>
        <p>The task is to identify the values of the widely accepted value taxonomy of Schwartz [10] (cf. Figure 1)
and their attainment in long texts of nine languages (Bulgarian, Dutch, English, French, German, Greek,
Hebrew, Italian, and Turkish). This taxonomy has been replicated in over 200 samples in 80 countries
and is the backbone of value research [12]. A value can either be mentioned as something that is or
should be attained (i.e., lead towards fulfilling the value) or something that is constrained, i.e., not
attained. For example, for Security, (partial) attainment would mean that something is made safer
or healthier. In contrast, an event can be stated in a way that thwarts or constrains safety or health.
Participating teams can submit software in one or both of two subtasks: (1) Given a text, for each
sentence, detect which human values the sentence refers to; and (2) Given a text, for each sentence and
value this sentence refers to, detect whether this reference (partially) attains or constrains the value.</p>
      </sec>
      <sec id="sec-4-2">
        <title>4.2. Data Description</title>
        <p>
          The task employs a collection of 2648 human-annotated texts in nine languages from news articles
and political manifestos. Texts are sampled to reflect diverse opinions (different parties; mainstream
news and others) from 2019 to 2023. The data is annotated as part of the ValuesML project4 by
over 70 value scholars. The annotators marked segments in the texts, selected from 19 values the
values that the segment refers to most, and selected for each of these values whether the segment
(partially) attains or constrains the value, or whether attainment is unclear. Dedicated team leaders
per language trained the respective annotators, discussed sentences for which annotators disagreed in
their teams, and consolidated annotations into one ground truth. The team leaders discussed issues
with us in bi-weekly meetings. Moreover, we discussed with the team leaders the current holistic
inter-annotator agreement [
          <xref ref-type="bibr" rid="ref7">45</xref>
          ] and its change compared to the previous meeting to monitor annotation
4https://knowledge4policy.ec.europa.eu/projects-activities/valuesml-unravelling-expressed-values-media-informed-policymaking_en
[Figure 1 value labels and the layout of Table 1 (per-language text and sentence counts) were garbled during text extraction and are omitted here.]
quality and coherence across documents and languages. To measure annotator agreement, we computed
Krippendorff’s α before curation for all language teams individually and overall (cf. Table 1). We see
this agreement as sufficient, and the curation process increased the annotation quality even further.
        </p>
        <p>
          For Touché, the dataset is automatically split into sentences using Trankit version 1.1.1 [
          <xref ref-type="bibr" rid="ref8">46</xref>
          ]. Table 2
shows the dataset format. The dataset is provided both in the original language and automatically
translated to English, either using DeepL or, for Hebrew, Google Translate.5 The dataset is split into
sets by texts, so that 60% / 20% / 20% of sentences are in the training / validation / test set, respectively.6
        </p>
        <p>Table 1 shows the size and value distribution for each language. The number of texts per language
are between 219 (French) and 408 (English). The number of sentences per language are between
4 650 (French) and 11 133 (Turkish). Only 30.4% of the French sentences are annotated as referring to a
value, but 85.9% of Hebrew sentences. The value frequency is between 0.2% (Humility) and 8.6% (Security:
societal). This imbalance between languages and values makes the problem especially challenging.</p>
      </sec>
      <sec id="sec-4-3">
        <title>4.3. Participant Approaches</title>
        <p>In 2024, nine teams participated in this task (of which six submitted a notebook paper) and submitted
21 runs. Moreover, we added two baseline runs for comparison. Five of the six teams that submitted a
paper relied on DeBERTa [3], RoBERTa [4], or the multi-lingual XLM-RoBERTa [5]. The other team (Eric
Fromm) used GPT-4o.7 Two teams work with the multi-lingual dataset (Arthur Schopenhauer, Hierocles
of Alexandria) whereas the others use the English translations only. Only one team (Hierocles of
Alexandria) used the sentence sequence, whereas the other teams classified each sentence individually.</p>
        <sec id="sec-4-3-1">
          <title>Who designed global guidelines for puberty blockers?</title>
        </sec>
        <sec id="sec-4-3-2">
          <title>More and more children and young people believe they have to question their . . .</title>
        </sec>
        <sec id="sec-4-3-3">
          <title>Some 60 minors were treated in the Netherlands in 2010, but has increased to . . .</title>
          <p>Self-direction: thought attained</p>
          <p>Self-direction: thought constrained . . .</p>
          <p>
            Baselines. We provide two baselines, that also served to kickstart the participants’ approaches:8 (1) a
random baseline assigns per sentence a uniformly random value “confidence” to each value in subtask 1
and randomly distributes this confidence between attained and constrained for subtask 2; and (2) a
BERT [
            <xref ref-type="bibr" rid="ref9">47</xref>
            ] baseline trained for multi-label classification for all 38 combinations of value and attainment.
Team Arthur Schopenhauer [
            <xref ref-type="bibr" rid="ref10">48</xref>
            ].9 The team used the multi-lingual dataset and analyzed the
sentences independently. They approached subtask 1 as a classification problem. A no-label class
was added for sentences without assigned value, and sentences with Humility were ignored due to
the scarcity of that value. The 6% of sentences with more than one assigned value were ignored, as
well. Different models were fine-tuned for English texts (deberta-v2-xxlarge [3]) and others
(xlmroberta-large [5]). In both cases, an ensemble with a thresholded soft voting scheme of four models
was employed: one model for each combination of two seeds and two loss functions. For loss functions
the authors report that cross entropy led to higher results in their preliminary tests for frequent
values but weighted cross entropy did so for infrequent values. The team approached subtask 2 as a
binary classification problem, ignoring the few sentences with unknown attainment. Their approach
is otherwise the same as for subtask 1, except that only a single model was employed instead of an
ensemble (with cross entropy loss) based on results from their preliminary tests.
          </p>
          <p>
            Team Edward Said [
            <xref ref-type="bibr" rid="ref11">49</xref>
            ]. The team used the English translations of the dataset and analyzed the
sentences independently. To counter the label imbalance, the team upsampled sentences by a factor of
four if the associated label is one of 14 underrepresented labels (value + attainment; out of 38). They
selected the 14 labels that are infrequent in total or in comparison to the label for the same value with
other attainment. They fine-tuned a RoBERTa [ 4] and DeBERTa [3] model for multi-label classification.
Team Eric Fromm [
            <xref ref-type="bibr" rid="ref12">50</xref>
            ]. The team used the English translations of the dataset and analyzed the
sentences independently. They employed GPT-4o for zero-shot classification, prompting with the
annotator guide’s 19 value descriptions to select at most one per sentence. They did not tackle subtask 2.
8https://github.com/touche-webis-de/touche-code/tree/main/clef24/human-value-detection/approaches
9Code: https://github.com/h-uns/clef2024-human-value-detection
Models: https://huggingface.co/h-uns
Image: docker pull webis/valueeval24-arthur-schopenhauer-ensemble:1.0.0
Team Hierocles of Alexandria [
            <xref ref-type="bibr" rid="ref13">51</xref>
            ].10 The team used both the multi-lingual dataset and English
translations and incorporated sentence sequence information. More specifically, their approach predicts
values for a sentence from an input text that consists of the previous two sentences concatenated
with the target sentence. The two preceding sentences contained special tokens to represent any
values assigned to them. During training and validation the true labels were employed, but during
testing the predicted labels of the previous sentences were leveraged. The team fine-tuned different
RoBERTa [4] and DeBERTa [3] models for English and XLM-RoBERTa [5] models for the multi-lingual
dataset, with the best performing one being XLM-RoBERTa-xl [
            <xref ref-type="bibr" rid="ref14">52</xref>
            ]. Moreover, they developed a custom
model architecture for multi-label text classification consisting of multiple classification heads. Each
classification head focused on a different language for the multi-lingual dataset. The custom model
architecture was adapted and employed for the English-translated dataset as well. After preliminary
experiments concerning loss functions, class weights and various thresholds, they used the binary
cross-entropy loss with logits as their loss function and selected an optimal classification threshold for
each value. The approach is trained to tackle both subtasks 1 and 2.
          </p>
          <p>
            Team Philo of Alexandria [
            <xref ref-type="bibr" rid="ref15">53</xref>
            ].11 The team used the English translations of the dataset and analyzed
the sentences independently. They approached subtask 1 as a multi-label problem and fine-tuned
DeBERTa (deberta-base [3]) after initial experiments with several models. They employ the same base
model for subtask 2 and fine-tune it to classify each text pair of sentence and human value name into
either attaining or constraining.
          </p>
        </sec>
      </sec>
      <sec id="sec-4-4">
        <title>Team SCaLAR NITK (code name: Peter Abelard) [54]</title>
        <p>The team used the English translations of the dataset and analyzed the sentences
independently. They experimented with SVMs, KNNs,
decision trees, hierarchical classification, transformer models and large language models. Based on
preliminary experiments, they fine-tuned a RoBERTa [4] model for both subtasks (multi-label and
binary classification, respectively).</p>
      </sec>
      </sec>
      <sec id="sec-4-5">
        <title>4.4. Task Evaluation</title>
        <p>
          Following ValueEval’23 [
          <xref ref-type="bibr" rid="ref2">2</xref>
          ], submissions are evaluated using standard macro F1-score over all values.
The same metric is used for the new subtask 2. The submission format allowed participants to submit
only one run file for both subtasks (same format as the labels.tsv), but the scores for the subtasks
are calculated independently of each other from the same file as follows. Each submission includes for
each sentence and value a confidence score (between 0 and 1) for both attained and constrained polarity.
If the sum of the two numbers is above 0.5, the submission is evaluated as having predicted that the
sentence refers to that value (subtask 1). For subtask 2, only the sentence-value pairs are considered for
which the sentence refers to the value according to the ground-truth. For these pairs, the submission is
evaluated as having predicted the attainment polarity for which it produced the larger confidence score.
        </p>
        <p>Table 3 shows the results for the best-performing approaches per team for both subtasks. The
best-performing approach for subtask 1 is the one of team Hierocles of Alexandria that uses
XLMRoBERTa-xl, the previous sentences, and is trained specifically for subtask 1. Overall, multilingual
models performed best, with also the second-in-place employing such a model. Rarer values are overall
detected worse, with the exception of the zero-shot approach by team Eric Fromm (especially Humility),
indicating insufficient training data. Several teams achieved top scores for subtask 2. Overall, this
binary classification task is, as one can expect, much easier than subtask 1. However, most teams
clearly focused their efforts on subtask 1, so there is likely more room for improvement.
10Code: https://github.com/SotirisLegkas/Touche-ValueEval24-Hierocles-of-Alexandria</p>
        <p>Image: docker pull webis/valueeval24-hierocles-of-alexandria:1.0.0
11Code: https://github.com/VictorMYeste/touche-human-value-detection</p>
        <p>Models: https://huggingface.co/VictorYeste/deberta-based-human-value-detection</p>
        <p>https://huggingface.co/VictorYeste/deberta-based-human-value-stance-detection</p>
        <p>
          Image: docker pull victoryeste/valueeval24-philo-of-alexandria-deberta-cascading
[Table 3 value-column headers, garbled during text extraction, omitted.]
To get a visual impression of the performance of the submissions, the radar plot in Figure 2 shows
the F1-score of each submission for each value as lines. As the plot shows, almost all submissions have
improved for all values compared to the random baseline (orange). However, all lines lie within the
black dashed boundary of the maximum F1-scores achieved by last year’s submissions on last year’s
dataset [
          <xref ref-type="bibr" rid="ref17 ref2">55, 2</xref>
          ]. This shows that the difficulty of the prediction task has increased compared to last year,
        </p>
        <p>[Garbled Figure 2 axis-label fragments omitted.]</p>
        <p>[Garbled Figure 2 axis-label fragments omitted.]</p>
        <p>
mainly due to the much rarer values. The difference between the datasets of the two years can also
be seen in the line of the Adam Smith classifier (purple) in comparison to that of the BERT baseline
(teal): Adam Smith performs worse than the specially trained BERT baseline (also: overall F1-score 0.20
vs. 0.24) since it was not retrained for the new dataset, even though in ValueEval’23 it significantly
improved over the BERT baseline (0.56 vs. 0.42).</p>
        <p>If one compares last year’s hull of submission lines (black dashed line, “ValueEval’23 max”) with
this year’s equivalent hull, one sees that some values in this year’s data set are particularly difficult to
predict. The visual spread between these hull lines is particularly large for the values Self-direction:
thought, Self-direction: action, Security: personal, Conformity: interpersonal and Humility. A likely
explanation for this is that these values are expressed very differently in the underlying source data.
We therefore conclude that value detectors are not yet robust across all text genres and that further
data sets in different genres are needed to achieve this goal.</p>
        <p>The study of parliamentary debates is crucial to understand the decision processes in the parliaments
and their societal impacts. The goal of this task is to automatically identify two important aspects of
parliamentary debates: the political orientation of the party of the speaker, and the role of the party of
the speaker in the governance of the country or the region. Identifying these underlying aspects of
parliamentary debates enables automated comprehension of these discussions, the decisions that these
discussions lead to, and their consequences.</p>
      </sec>
      <sec id="sec-4-6">
        <title>5.1. Task Definition</title>
        <p>Both subtasks were defined as binary classification tasks: Given a parliamentary speech, (1) predict the
political orientation of the party of the speaker on the left–right spectrum, and (2) predict whether the
speaker belongs to one of the governing parties or the opposition. The first task is relatively well studied,
and there have been some recent shared tasks on identifying political orientation [32, 33]. Unlike the
earlier tasks, our data set includes multiple parliaments and languages, and is based on parliamentary
debates. To the best of our knowledge, automatic identification of governing role—power—has not been
studied earlier.</p>
      </sec>
      <sec id="sec-4-7">
        <title>5.2. Data Description</title>
        <p>
          The source of the data for this task is the ParlaMint [
          <xref ref-type="bibr" rid="ref18">56</xref>
          ], a uniformly encoded and annotated corpus of
transcripts of parliamentary speeches from multiple national and regional parliaments.12 The
ParlaMint version 4.0 used for the task includes data from the following national and regional
parliaments: Austria (AT), Bosnia and Herzegovina (BA), Belgium (BE), Bulgaria (BG), Czechia (CZ),
Denmark (DK), Estonia (EE), Spain (ES), Catalonia (ES-CT), Galicia (ES-GA), Basque Country (ES-PV),
Finland (FI), France (FR), Great Britain (GB), Greece (GR), Croatia (HR), Hungary (HU), Iceland (IS),
Italy (IT), Latvia (LV), The Netherlands (NL), Norway (NO), Poland (PL), Portugal (PT), Serbia (RS),
Sweden (SE), Slovenia (SI), Turkey (TR) and Ukraine (UA). The labels for both subtasks are also coded in
the ParlaMint corpora. For the sake of simplicity, we formulate both tasks as binary classification tasks.
For both tasks, the main challenge in the creation of a dataset is to minimize the effects of covariates.
Even though the instances to classify are speeches, the annotations are based on the party membership
of the speaker. As a result, underlying variables like party membership, or speaker identity perfectly
covary with ideology and power in most cases.
        </p>
        <p>As a trade-off between data size and reducing the effect of covariates, we opt for a speaker-based
sampling. First, to discourage, to some extent, the classifiers from relying on author identification, we
sample at most 20 speeches of a single speaker. This is also important for introducing variation into
the dataset, as the number of speeches from each speaker follows a power-law distribution: While a
small number of speakers tend to deliver most of the speeches, e.g., party or party group leaders, most
speakers have relatively few speeches. The distribution of speeches or speakers to include in training
and test sets is also important for proper evaluation. For the ideology task, the set of speakers in the
training and test sets are disjoint. The ideal dataset split for the power identification task requires a
different constraint: training and test sets should include speeches from the same speaker with different
power roles. To come as close as possible to this ideal split, we opt for a best-effort training–test split.
When possible, we make sure that the speakers in the test set are also available in the training set with
the opposite power role. Otherwise, we randomly sample more speakers to obtain the test set.</p>
        <p>
          For evaluation, we set the test set size to 2 000 instances for both subtasks (100 to 200 speakers
depending on the individual corpus and the task). Despite multiple speeches from each speaker, due to
12Although all transcripts are obtained through the data published by the respective parliaments, the method for obtaining
the transcripts varies, such as scraping the web site of the parliament, extracting from published PDF files, and obtaining
through an API provided by the parliament. For details, we refer to [
          <xref ref-type="bibr" rid="ref18">56</xref>
          ].
Left
Opposition
        </p>
        <p>Right
Fig. 3. Overview of the Touché24 ideology and power identification dataset. The bars
show the training set for both subtasks for each parliament. Test set sizes are
approximately 2 000 speeches for all parliaments.
missing annotations and the lack of diversity of orientation in some parliaments, the disjoint speakers
constraint mentioned above results in a small number of instances in the training set for some of
the parliaments. Not all parliamentary data provides both labels. Some countries do not have the</p>
        <p>opposition–governing party distinction, and for the Galician parliament, the number and distribution
of orientation labels did not result in a test set that was large enough. Figure 3 shows the training set
sizes for each parliament. The test set size for all parliaments is approximately 2 000 speeches. We
discuss the data in a separate publication [57].13</p>
        <p>In addition to the original speech transcripts and labels, we also provide automatic English translations,
an anonymized speaker ID and the speaker’s sex in the data for both tasks. Except the speaker ID,
which is not in the test sets.</p>
        <p>Both data sets exhibit a mild class and text length imbalance between parliaments. The data set’s size
was a technical challenge for some participants. The average text length is approximately 600
space-separated tokens. Moreover, the data set is also large overall (more than 3GB uncompressed).</p>
        <p>5.3. Participant Approaches</p>
        <p>In 2024, 9 teams participated in this task and submitted 52 runs. We added a baseline for comparison.
Unlike the ValueEval task, where pretrained language models were the dominant classifiers, for
this task many participants preferred traditional, ‘computationally light’ approaches. A possible reason
may be the large text size which is more costly to process with larger systems. Most teams, even the
teams that used language models with large context sizes, truncated the texts to alleviate computational
requirements. Some of the interesting improvements include ensemble of classifiers, data augmentation
through back-translation and synonym replacement, multi-task learning, additional features, such as
sentiment scores, and the use of domain-specific models.</p>
        <p>13Training and test data are available at https://zenodo.org/doi/10.5281/zenodo.10450640, and https://zenodo.org/doi/10.5281/</p>
        <p>Baselines. We provided only a single logistic regression baseline with tf-idf weighted character
n-grams. The baseline is intentionally kept simple to encourage participation by early researchers, and
reduce the computation requirements.</p>
        <p>
          Team Policy Parsing Panthers [
          <xref ref-type="bibr" rid="ref20">58</xref>
          ]. The team did a set of experiments with original transcripts
and their English translations, using various deep pretrained models, including BERT [
          <xref ref-type="bibr" rid="ref9">47</xref>
          ], mBERT
[
          <xref ref-type="bibr" rid="ref9">47</xref>
          ], RoBERTa [4], XLM-RoBERTa [5], DeBERTa-v3 [3] Gemma [
          <xref ref-type="bibr" rid="ref21">59</xref>
          ] and ensembles of these models.
This team presents an extensive set of approaches, and their analyses. A few interesting approaches
worth mentioning in this short summary includes (1) Data augmentation and balancing through
backtranslation, (2) experiments with additional metadata, (3) multi-task learning, (4) the use of automatically
obtained polarity labels, and increasing the number of instances in the training set of the orientation
subtask by using the matching speaker IDs in the power dataset. This team participated in both subtasks
for all parliaments.
        </p>
        <p>
          Team Trojan Horses [
          <xref ref-type="bibr" rid="ref22">60</xref>
          ]. The team experimented with improving the logistic regression baseline,
as well as fine-tuning BERT. They used the English translations and participated in both subtasks for
the majority of the parliaments.
        </p>
        <p>
          Team Pixel Phantoms [
          <xref ref-type="bibr" rid="ref23">61</xref>
          ]. The team experimented with some of the traditional classifiers (SVMs,
logistic regression and decision trees) using the English translations provided. As well as tf-idf weighted
features, they also extracted text embeddings from DistilBERT [
          <xref ref-type="bibr" rid="ref24">62</xref>
          ], through Sentence BERT [
          <xref ref-type="bibr" rid="ref25">63</xref>
          ]. They
participated in both subtasks for the majority of the parliaments.
        </p>
        <p>
          Team Ssnites [
          <xref ref-type="bibr" rid="ref26">64</xref>
          ]. The team fine-tuned BERT for the majority of parliaments and both subtasks.
They relied on the English translations provided, and participated in both subtasks for the majority of
the parliaments.
        </p>
        <p>
          Team Hale Lab [
          <xref ref-type="bibr" rid="ref27">65</xref>
          ]. After some initial experiments with BERT, the team used a variety of
classification methods including simple feed-forward networks, and LSTMs. The features for the models were
either bag-of-words features weighted with tf-idf, or the multilingual LASER [
          <xref ref-type="bibr" rid="ref28">66</xref>
          ] embeddings. They
used the original (untranslated) data, using various libraries for tokenization and preprocessing, and
participated in both subtasks for the majority of the parliaments.
        </p>
        <p>
          Team Vayam Solve Kurmaha [
          <xref ref-type="bibr" rid="ref29">67</xref>
          ]. This team also experimented with multiple traditional
classification methods (SVM, kNN, random forests) and their ensembles, using the English translations. The
team also used data augmentation through synonym replacement. They participated in both subtasks
for the majority of the parliaments.
        </p>
        <p>
          Team Gerber [
          <xref ref-type="bibr" rid="ref30">68</xref>
          ]. The team used a convolutional neural network (CNN) for the task without any
pretrained embeddings. They used the original transcripts only, and participated in both subtasks for
the majority of the parliaments.
        </p>
        <p>
          Team JU_NLP_DID [
          <xref ref-type="bibr" rid="ref31">69</xref>
          ]. The team used SVM classifiers with tf-idf features, participating in both
subtasks for the majority of the parliaments. They also make use of automatic sentiment labels as an
additional feature.
Team INSA Passau [
          <xref ref-type="bibr" rid="ref32">70</xref>
          ]. The team also experimented with multiple approaches, where some of
their submissions were focused on orientation identification and a smaller number of parliaments. The
methods used included training SVMs, fine-tuning BERT-based models (pre)trained on legal documents
[
          <xref ref-type="bibr" rid="ref33 ref34">71, 72</xref>
          ] and finetuning and zero- and few-shot prompting the Llama [
          <xref ref-type="bibr" rid="ref35">73</xref>
          ] version 3 models with varying
sizes (which were released while the shared task was running).
        </p>
      </sec>
      <sec id="sec-4-8">
        <title>5.4. Task Evaluation</title>
        <p>We use macro-averaged F1-score as the main evaluation metric for both subtasks. Similar to the
ValueEval task, the participants were encouraged to submit confidence scores, where a score over 0.5 is
interpreted as class 1 and otherwise 0.</p>
        <p>Table 4 and Table 5 present the overall best-performing approaches per team for the ideology
and power subtasks respectively. The best scores for both tasks are from the team Policy Parsing
Panthers. The team used an ensemble of multiple models, with multiple improvements including data
augmentation and multitask learning. Results on the tables do not include approaches that were focused
on only one or a small number of parliaments. A noteworthy focused submission for only GB and
ideology subtask by the team INSA Passau based on fine-tuning the most recent Llama 3 model achieved
the second-best result for this parliament. Although the results on both tasks are higher than the
baseline we provided, the variation in the scores indicate that there is quite some room for improvement
for each of the approaches.</p>
        <p>As the results show, as formulated in this task, identifying orientation is slightly more difficult
than identifying power. The overall success of the systems on a particular parliament depends on,
among others, size and class distribution of the training data, and composition of the parliament. For
example, there is a general trend (with some exceptions) that for parliaments with few or no government
and opposition role changes in the data (e.g., HU, PL, and TR) the roles are easier to predict than for
parliaments with more varied composition and more role changes (e.g., AT, BA, and UA).</p>
      </sec>
    </sec>
    <sec id="sec-5">
      <title>6. Task 3: Image Retrieval/Generation for Arguments (joint task with</title>
    </sec>
    <sec id="sec-6">
      <title>ImageCLEF)</title>
      <p>
        Images provide powerful visual communication, are usually perceived before text is read, and can appeal
directly to our emotions. The goal of this task is to find images that convey premises. The proper use
of an image can increase the persuasiveness of an argument. In this regard, images can increase the
pathos [
        <xref ref-type="bibr" rid="ref36">74</xref>
        ], which is the effect an argument has on its audience.
      </p>
      <sec id="sec-6-1">
        <title>6.1. Task Definition</title>
        <p>
          This observation leads to our task, in which participants are asked to find images based on an argument
that help to convey the premise of the argument. In this context, “convey” is meant in broad terms; it
can represent what is described in the argument, but it can also show a generalization (e.g., a symbolic
image that illustrates a related abstract concept) or a specialization (e.g., a concrete example). There is a
difference between verbal language and images. Verbal language provides clear but limited information,
while images provide more information than written words, but are not as precise [
          <xref ref-type="bibr" rid="ref37">75</xref>
          ]. Therefore,
images alone can be ambiguous and difficult to understand without context, e.g. when they refer to
symbolism. For this reason, we offer the option of submitting a rationale together with the image. The
rationale is an explanatory statement that assists in understanding the picture. For example, it can be a
caption or contextual information about the image. The image and the rationale are evaluated together
to see how this combination conveys the premise. Participants can choose to use a retrieval approach,
where they submit images from a provided dataset, or a generation-based approach, where suitable
images can be generated using a model of their choice. In each submission, a participant can submit up
to 10 images in a ranking order for an argument.
        </p>
      </sec>
      <sec id="sec-6-2">
        <title>6.2. Data Description</title>
        <p>
          For the task we prepared a dataset14 containing 136 arguments and over 9000 images. The arguments
were generated with GPT-4 [
          <xref ref-type="bibr" rid="ref38">76</xref>
          ] and correspond to 24 topics. The topics were taken from various IBM
datasets 15 and previous Touché Shared Tasks16. Each generated argument consists of a premise and a
claim, and can take a pro or con stance on the topic. An example of an argument can be seen in Fig. 4.
Each of the images in the dataset is tagged with additional information, such as the URL and content of
the corresponding website. In addition, we have provided an analysis of each image using the Google
14https://zenodo.org/records/11045831
15https://research.ibm.com/haifa/dept/vst/debating_data.shtml
16https://touche.webis.de/shared-tasks.html
        </p>
        <sec id="sec-6-2-1">
          <title>Premise: The idea of winning through intentional infliction</title>
          <p>of pain and harm to another person can nurture a violent
and destructive mentality.</p>
        </sec>
        <sec id="sec-6-2-2">
          <title>Claim: Boxing poses both physical and psychological threats</title>
          <p>to participants, hence it should be banned.</p>
          <p>
            Rationale: The image captures a boxing match in progress,
with two men standing in the ring. One of them is wearing
a red glove and appears to be getting hit by his opponent’s
punch. The other man is also wearing a red glove, likely as
part of his attire for the match. The boxers are focused on
their performance, with one of them holding his mouth open
while taking a blow from his opponent. The scene showcases
the intensity and determination of these athletes during the
competition.
Cloud Vision API, as well as an automatically generated caption using LLaVA [
            <xref ref-type="bibr" rid="ref39">77</xref>
            ]. An example of a
submission can be seen in Figure 5.
          </p>
        </sec>
      </sec>
      <sec id="sec-6-3">
        <title>6.3. Participant Approaches</title>
        <p>In 2024, 2 teams participated in this task and submitted 8 runs. All teams chose the retrieval approach.
Moreover, we added 2 baseline runs for comparison.</p>
        <p>
          Baselines. The first baseline is BM25, where the corresponding documents are the image captions
from the data set and the query is the premise of the argument. In the second baseline, keywords are
first extracted from the image captions. Then embeddings for the premise of an argument and the
keywords are generated with SBERT [
          <xref ref-type="bibr" rid="ref25">63</xref>
          ]. A corresponding relevance score is calculated based on the
cosine similarity between the embeddings and averaging them. The most relevant images are selected
for submission.
        </p>
        <p>
          DS@GT [
          <xref ref-type="bibr" rid="ref40">78</xref>
          ]. The team uses CLIP [6] to embed each argument and each image in a common
embedding space. The first approach ranks images by cosine similarity of the embeddings. The second
approach compares for each argument the 40 highest ranked images to images that are generated to
support or attack the argument. The most similar images are submitted. For image generation, Stable
Diffusion v2-1 [
          <xref ref-type="bibr" rid="ref41">79</xref>
          ] was used.
        </p>
        <p>
          HTW-DIL [
          <xref ref-type="bibr" rid="ref42">80</xref>
          ]. The team has chosen an approach inspired by DPR [7]. It applies a fine-tuned
multimodal Moondream model based on the Phi 1.5 LLM [
          <xref ref-type="bibr" rid="ref43">81</xref>
          ] and uses SigLIP [
          <xref ref-type="bibr" rid="ref44">82</xref>
          ] for its vision
capabilities. To generate synthetic training data, the team uses GPT-4 to generate arguments from the
available image/web page data. Combinations of positive and negative argument-image pairs are used
for training. The results are obtained by maximising the cosine similarity for argument and image
embeddings. To enable comparability, the team also adopted a standard approach of embedding the
corresponding website content of the images and each argument using OpenAI’s Ada model17 and
selecting the most similar pairs.
        </p>
      </sec>
      <sec id="sec-6-4">
        <title>6.4. Task Evaluation</title>
        <p>For each argument and each submission, the best 5 images together with the rationales are evaluated
by a human expert. This expert knows neither the rank of the image nor the team that submitted it. To
facilitate the annotation, we prepared a narrative for each argument that describes what a conveying
image should generally show. Therefore, each combination of image, argument and rationale is rated
on a three-point Likert scale from 0 to 2, where 0 means that the image does not convey the premise at
all, 1 stands for partial conveyance and 2 means that the image conveys the premise completely. A total
of 5,061 image, argument and rationale triples were annotated. For seven topics, only very few relevant
images could be submitted by the participating teams, so we removed these topics, resulting in a total
number of 104 arguments for the final evaluation. For each submission, we first calculated the NDCG
score for each argument. For the required IDCG, we have considered all submitted image, argument
and rationale triples submitted for the corresponding argument. The final score of a submission is the
average of all NDCG scores for all arguments.</p>
        <p>Table 6 shows the results for both teams and baselines. For all three NDCG measures, team HTW-DIL
achieved the highest scores with the submission that ranks the results using OpenAI’s Ada embeddings
of the website content and the argument—thus not using the image at all. Other submissions were
similar to the top-performing submissions from previous years. As such an approach was not successful
in earlier years, likely this year’s updated task description, which provides complete arguments instead
of mere topics, enabled the top performance of this approach. The performance of combined approaches
is yet to be tested. And as the achieved scores below 0.5 show, the identification of images that convey
a specific argument is still a very challenging task.</p>
      </sec>
    </sec>
    <sec id="sec-7">
      <title>7. Conclusion</title>
      <p>The fifth edition of the Touché lab on argumentation systems featured three tasks: (1) Human Value
Detection, (2) Ideology and Power Identification in Parliamentary Debates, and (3) Image
Retrieval/Generation for Arguments. In contrast to previous years, the focus this year was more on classification
than retrieval tasks. Furthermore, two of the three tasks were multilingual, although automatic English
transcriptions were provided to facilitate participation. We expanded the scope of Touché with the new
tasks on human values and political power and orientation. In addition, we methodically extended the
retrieval task by allowing participants to generate images instead of retrieving them. Unfortunately, no
team submitted generated images in the end.</p>
      <p>Of the 68 registered teams, 20 participated in the tasks and submitted a total of 81 runs. Participants
mainly used classification architectures, with BERT and variants still very dominant, although more
17https://platform.openai.com/docs/models/embeddings
1 HTW-DIL Ada-Summary
2 HTW-DIL Moondream-Text
3 HTW-DIL Moondream-Default-Image-Text
4 Baseline BM25
5 Baseline SBERT
6 DS@GT Generated-Image-CLIP
7 HTW-DIL Moondream-Image-Text-EP3
8 HTW-DIL Moondream-Image
9 DS@GT Base-CLIP
10 HTW-DIL Moondream-Image-Text-EP2
classical machine learning models were also used in the Ideology and Power Identification in Parliamentary
Debates task. Generative models, on the other hand, were rarely used. The Image Retrieval/Generation
for Arguments task changed to seeking images for a specific argument rather than a topic, and the
best-performing submission used an approach that was not successful for the previous task definitions:
it ranked images by the embedding similarity between the argument and the web page that contains
the image—and thus ignored the actual image content.</p>
      <p>We plan to continue Touché as a collaborative platform for researchers in argumentation systems.
All Touché resources are freely available, including topics, manual relevance, argument quality, and
stance judgments, and submitted runs from participating teams. These resources and other events such
as workshops will help to further foster the community working on argumentation systems.</p>
    </sec>
    <sec id="sec-8">
      <title>Acknowledgments</title>
      <p>This work was partially supported by the European Commission under grant agreement GA 101070014
(https://openwebsearch.eu) and the German Research Foundation under project 455911521 (LARGA) as
part of the SPP 1999 (RATIO). The ideology and power identification shared task has been supported by
CLARIN ERIC, under the ParlaMint project (https://www.clarin.eu/parlamint).
A. K. Ojha, A. S. Doğruöz, G. D. S. Martino, H. T. Madabushi (Eds.), Proc. of SemEval, ACL, 2023,
pp. 2287–2303. doi:10.18653/v1/2023.semeval-1.313.
[3] P. He, X. Liu, J. Gao, W. Chen, DeBERTa: decoding-enhanced BERT with disentangled attention,
in: Proc. of ICLR, 2021. URL: https://openreview.net/forum?id=XPZIaotutsD.
[4] Y. Liu, M. Ott, N. Goyal, J. Du, M. Joshi, D. Chen, O. Levy, M. Lewis, L. Zettlemoyer, V. Stoyanov,
RoBERTa: A Robustly Optimized BERT Pretraining Approach, CoRR (2019). URL: http://arxiv.org/
abs/1907.11692.
[5] A. Conneau, K. Khandelwal, N. Goyal, V. Chaudhary, G. Wenzek, F. Guzmán, E. Grave, M. Ott,
L. Zettlemoyer, V. Stoyanov, Unsupervised Cross-lingual Representation Learning at Scale, in:
D. Jurafsky, J. Chai, N. Schluter, J. R. Tetreault (Eds.), Proc. of ACL, ACL, 2020, pp. 8440–8451.
doi:10.18653/v1/2020.acl-main.747.
[6] A. Radford, et al., Learning Transferable Visual Models From Natural Language Supervision,
in: M. Meila, T. Zhang (Eds.), Proc. of ICML, volume 139, PMLR, 2021, pp. 8748–8763. URL:
https://proceedings.mlr.press/v139/radford21a.html.
[7] V. Karpukhin, et al., Dense Passage Retrieval for Open-Domain Question Answering, in: Proc. of</p>
      <p>EMNLP, ACL, 2020, pp. 6769–6781. doi:10.18653/v1/2020.emnlp-main.550.
[8] S. H. Schwartz, Are There Universal Aspects in the Structure and Contents of Human Values?,</p>
      <p>Journal of Social Issues (1994) 19–45. doi:10.1111/j.1540-4560.1994.tb01196.x.
[9] T. Bench-Capon, Persuasion in Practical Argument Using Value-based Argumentation Frameworks,</p>
      <p>Journal of Logic and Computation 13 (2003) 429–448. doi:10.1093/logcom/13.3.429.
[10] S. H. Schwartz, J. Cieciuch, M. Vecchione, E. Davidov, R. Fischer, C. Beierlein, A. Ramos,
M. Verkasalo, J.-E. Lönnqvist, K. Demirutku, et al., Refining the Theory of Basic Individual
Values, Journal of personality and social psychology (2012). doi:10.1037/a0029393.
[11] M. Scharfbillig, V. Ponizovskiy, Z. Pasztor, J. Keimer, G. Tirone, Monitoring Social Values in Online
Media Articles on Child Vaccinations, Technical Report, European Commission’s Joint Research
Centre, Luxembourg, 2022. doi:10.2760/86884.
[12] M. Scharfbillig, L. Smillie, D. Mair, M. Sienkiewicz, J. Keimer, R. Pinho Dos Santos, H. Vinagreiro
Alves, E. Vecchione, L. Scheunemann, Values and Identities - a Policymaker’s Guide, Technical
Report, European Commission’s Joint Research Centre, Luxembourg, 2021. doi:10.2760/349527.
[13] P. Ammanabrolu, L. Jiang, M. Sap, H. Hajishirzi, Y. Choi, Aligning to Social Norms and Values in
Interactive Narratives, in: M. Carpuat, M. de Marnefe, I. V. M. Ruíz (Eds.), Proc. of NAACL-HLT
2022, ACL, 2022, pp. 5994–6017. doi:10.18653/v1/2022.naacl-main.439.
[14] R. Liu, C. Jia, G. Zhang, Z. Zhuang, T. X. Liu, S. Vosoughi, Second Thoughts are Best: Learning
to Re-Align With Human Values from Text Edits, Advances in Neural Information Processing
Systems 35 (2022) 181–196.
[15] L. Qiu, Y. Zhao, J. Li, P. Lu, B. Peng, J. Gao, S. Zhu, ValueNet: A New Dataset for Human Value
Driven Dialogue System, in: Proc. of AAAI, AAAI Press, 2022, pp. 11183–11191. doi:10.1609/
aaai.v36i10.21368.
[16] M. Forbes, J. D. Hwang, V. Shwartz, M. Sap, Y. Choi, Social Chemistry 101: Learning to Reason
about Social and Moral Norms, in: B. Webber, T. Cohn, Y. He, Y. Liu (Eds.), Proc. of EMNLP, ACL,
2020, pp. 653–670. doi:10.18653/v1/2020.emnlp-main.48.
[17] J. Kiesel, M. Alshomary, N. Handke, X. Cai, H. Wachsmuth, B. Stein, Identifying the Human Values
behind Arguments, in: S. Muresan, P. Nakov, A. Villavicencio (Eds.), Proc. of ACL, ACL, 2022, pp.
4459–4471. doi:10.18653/v1/2022.acl-long.306.
[18] D. Fišer, J. Lenardič, CLARIN resources for parliamentary discourse research, in: D. Fišer,</p>
      <p>M. Eskevich, F. de Jong (Eds.), Proc. of LREC, ELRA, 2018.
[19] J. Lenardič, D. Fišer, CLARIN Resource Families: Parliamentary Corpora, 2023. https://www.clarin.</p>
      <p>eu/resource-families/parliamentary-corpora, accessed on 2024-07-09.
[20] G. Glavaš, F. Nanni, S. P. Ponzetto, Computational Analysis of Political Texts: Bridging Research
Eforts Across Communities, in: 57th Annual Meeting of the Association for Computational
Linguistics: Tutorial Abstracts, ACL, 2019, pp. 18–23. doi:10.18653/v1/P19-4004.
[21] G. Abercrombie, R. Batista-Navarro, Sentiment and position-taking analysis of parliamentary
debates: a systematic literature review, Journal of Computational Social Science 3 (2020) 245–270.
[22] A. Arian, M. Shamir, The primarily political functions of the left-right continuum, Comparative
politics 15 (1983) 139–158.
[23] F. Vegetti, D. Širinić, Left–right categorization and perceptions of party ideologies, Political
Behavior 41 (2019) 257–280.
[24] T. van Dijk, Discourse and Power, Bloomsbury Publishing, 2008.
[25] N. Fairclough, Critical Discourse Analysis: The Critical Study of Language, Longman applied
linguistics, Taylor &amp; Francis, 2013. doi:10.4324/9781315834368.
[26] N. Fairclough, Language and Power, Language In Social Life, Taylor &amp; Francis, 2013. doi:10.4324/
9781315838250.
[27] M. D. Conover, B. Gonçalves, J. Ratkiewicz, A. Flammini, F. Menczer, Predicting the political
alignment of Twitter users, in: Proc. of PASSAT and SocialCom, IEEE, 2011, pp. 192–199. doi:10.
1109/PASSAT/SocialCom.2011.34.
[28] S. Gerrish, D. M. Blei, Predicting Legislative Roll Calls from Text, in: L. Getoor, T. Scheffer (Eds.),
Proc. of ICML, Omnipress, 2011, pp. 489–496.
[29] D. Preoţiuc-Pietro, Y. Liu, D. Hopkins, L. Ungar, Beyond Binary Labels: Political Ideology Prediction
of Twitter Users, in: R. Barzilay, M.-Y. Kan (Eds.), Proc. of ACL, ACL, 2017, pp. 729–740. doi:10.
18653/v1/P17-1068.
[30] F. Pla, L.-F. Hurtado, Political Tendency Identification in Twitter using Sentiment Analysis
Techniques, in: J. Tsujii, J. Hajic (Eds.), Proc. of Coling, Dublin City University and ACL, 2014, pp.
183–192. URL: https://aclanthology.org/C14-1019.
[31] C. Chen, D. Walker, V. Saligrama, Ideology Prediction from Scarce and Biased Supervision: Learn
to Disregard the “What” and Focus on the “How”!, in: A. Rogers, J. Boyd-Graber, N. Okazaki
(Eds.), Proc. of ACL (Volume 1: Long Papers), ACL, Toronto, Canada, 2023, pp. 9529–9549. doi:10.
18653/v1/2023.acl-long.530.
[32] J. A. García-Díaz, et al., Overview of PoliticES 2022: Spanish Author Profiling for Political Ideology,
Procesamiento del Lenguaje Natural 69 (2022) 265–272. doi:10.26342/2022-69-23.
[33] D. Russo, et al., PoliticIT at EVALITA 2023: Overview of the political ideology detection in Italian
texts task, in: Proc. of EVALITA, volume 3473 of CEUR Workshop Proceedings, CEUR-WS.org, 2023.
URL: https://ceur-ws.org/Vol-3473/paper7.pdf.
[34] G. M. Kurtoğlu Eskişar, Ç. Çöltekin, Emotions Running High? A Synopsis of the state of Turkish
Politics through the ParlaMint Corpus, in: D. Fišer, M. Eskevich, J. Lenardič, F. de Jong (Eds.), Proc.
of ParlaCLARIN, ELRA, 2022, pp. 61–70. URL: https://aclanthology.org/2022.parlaclarin-1.10.
[35] M. Mochtak, P. Rupnik, N. Ljubešić, The ParlaSent Multilingual Training Dataset for Sentiment
Identification in Parliamentary Proceedings, in: N. Calzolari, M.-Y. Kan, V. Hoste, A. Lenci, S. Sakti,
N. Xue (Eds.), Proc. of LREC, ELRA and ICCL, 2024, pp. 16024–16036. URL: https://aclanthology.
org/2024.lrec-main.1393.
[36] O. Tarkka, J. Koljonen, M. Korhonen, J. Laine, K. Martiskainen, K. Elo, V. Laippala, Automated
Emotion Annotation of Finnish Parliamentary Speeches Using GPT-4, in: D. Fiser, M. Eskevich,
D. Bordon (Eds.), Proc. of ParlaCLARIN, ELRA and ICCL, 2024, pp. 70–76. URL: https://aclanthology.
org/2024.parlaclarin-1.11.
[37] C. Navarretta, D. Haltrup Hansen, Government and opposition in Danish parliamentary debates,
in: D. Fiser, M. Eskevich, D. Bordon (Eds.), Proc. of ParlaCLARIN, ELRA and ICCL, 2024, pp.
154–162. URL: https://aclanthology.org/2024.parlaclarin-1.23.
[38] I. J. Dove, On images as evidence and arguments, in: F. H. van Eemeren, B. Garssen (Eds.), Topical
Themes in Argumentation Theory: Twenty Exploratory Studies, Argumentation Library, Springer
Netherlands, Dordrecht, 2012, pp. 223–238. doi:10.1007/978-94-007-4041-9_15.
[39] F. Dunaway, Images, emotions, politics, Modern American History 1 (2018) 369–376. doi:10.
1017/mah.2018.17.
[40] G. Roque, Visual argumentation: A further reappraisal, in: F. H. van Eemeren, B. Garssen (Eds.),
Topical Themes in Argumentation Theory, volume 22, Springer Netherlands, 2012, pp. 273–288.
doi:10.1007/978-94-007-4041-9_18.</p>
    </sec>
    <sec id="sec-9">
      <title>A. Extended Results</title>
      <p>Team</p>
      <p>Approach</p>
      <p>F1-score
Lang. Overall Self-direction: thought Self-direction: action Stimulation Hedonism Achievement Power: dominance Power: resources Face Security: personal Security: societal Tradition Conformity: rules Conformity: interpersonal Humility Benevolence: caring Benevolence: dependability Universalism: concern Universalism: nature Universalism: tolerance
Lang. Overall Self-direction: thought Self-direction: action Stimulation Hedonism Achievement Power: dominance Power: resources Face Security: personal Security: societal Tradition Conformity: rules Conformity: interpersonal Humility Benevolence: caring Benevolence: dependability Universalism: concern Universalism: nature Universalism: tolerance</p>
    </sec>
  </body>
  <back>
    <ref-list>
      <ref id="ref1">
        <mixed-citation>
          [1]
          <string-name>
            <given-names>J.</given-names>
            <surname>Kiesel</surname>
          </string-name>
          , Ç. Çöltekin,
          <string-name>
            <given-names>M.</given-names>
            <surname>Heinrich</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Fröbe</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Alshomary</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B. D.</given-names>
            <surname>Longueville</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Erjavec</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Handke</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Kopp</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ljubešić</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Meden</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Mirzakhmedova</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Morkevičius</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Reitis-Münstermann</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Scharfbillig</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Stefanovitch</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Wachsmuth</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Potthast</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Stein</surname>
          </string-name>
          , Overview of Touché 2024:
          <article-title>Argumentation Systems</article-title>
          , in: L.
          <string-name>
            <surname>Goeuriot</surname>
            ,
            <given-names>P.</given-names>
          </string-name>
          <string-name>
            <surname>Mulhem</surname>
            ,
            <given-names>G.</given-names>
          </string-name>
          <string-name>
            <surname>Quénot</surname>
            ,
            <given-names>D.</given-names>
          </string-name>
          <string-name>
            <surname>Schwab</surname>
            ,
            <given-names>L.</given-names>
          </string-name>
          <string-name>
            <surname>Soulier</surname>
          </string-name>
          ,
          <string-name>
            <surname>G. M. D. Nunzio</surname>
            ,
            <given-names>P.</given-names>
          </string-name>
          <string-name>
            <surname>Galuščáková</surname>
          </string-name>
          ,
          <string-name>
            <surname>A. G. S. de Herrera</surname>
          </string-name>
          , G. Faggioli, N. Ferro (Eds.),
          <source>Experimental IR Meets Multilinguality, Multimodality, and Interaction. Proceedings of the Fifteenth International Conference of the CLEF Association (CLEF</source>
          <year>2024</year>
          ), Lecture Notes in Computer Science, Springer, Berlin Heidelberg New York,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref2">
        <mixed-citation>
          [2]
          <string-name>
            <given-names>J.</given-names>
            <surname>Kiesel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Alshomary</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Mirzakhmedova</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Heinrich</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Handke</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Wachsmuth</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Stein</surname>
          </string-name>
          , SemEval
          <article-title>-2023 Task 4: ValueEval: Identification of Human Values behind Arguments</article-title>
          , in: R. Kumar,
        </mixed-citation>
      </ref>
      <ref id="ref3">
        <mixed-citation>
          [41]
          <string-name>
            <surname>I. Grancea</surname>
          </string-name>
          ,
          <article-title>Types of visual arguments, Argumentum</article-title>
          .
          <source>Journal of the Seminar of Discursive Logic</source>
          ,
          <source>Argumentation Theory and Rhetoric</source>
          <volume>15</volume>
          (
          <year>2017</year>
          )
          <fpage>16</fpage>
          -
          <lpage>34</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref4">
        <mixed-citation>
          [42]
          <string-name>
            <given-names>D.</given-names>
            <surname>Dimitrov</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B. Bin</given-names>
            <surname>Ali</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Shaar</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Alam</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Silvestri</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Firooz</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Nakov</surname>
          </string-name>
          , G. Da San Martino, SemEval
          <article-title>-2021 Task 6: Detection of Persuasion Techniques in Texts and Images</article-title>
          ,
          <source>in: Proc. of SemEval</source>
          , ACL,
          <year>2021</year>
          , pp.
          <fpage>70</fpage>
          -
          <lpage>98</lpage>
          . URL: https://aclanthology.org/
          <year>2021</year>
          .semeval-
          <volume>1</volume>
          .7. doi:
          <volume>10</volume>
          .18653/ v1/
          <year>2021</year>
          .semeval-
          <volume>1</volume>
          .7.
        </mixed-citation>
      </ref>
      <ref id="ref5">
        <mixed-citation>
          [43]
          <string-name>
            <given-names>H.</given-names>
            <surname>Wachsmuth</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Naderi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Hou</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Bilu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Prabhakaran</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T. A.</given-names>
            <surname>Thijm</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Hirst</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Stein</surname>
          </string-name>
          ,
          <article-title>Computational argumentation quality assessment in natural language</article-title>
          ,
          <source>in: Proc. of EACL</source>
          ,
          <year>2017</year>
          , pp.
          <fpage>176</fpage>
          -
          <lpage>187</lpage>
          . URL: https://aclanthology.org/E17-1017.
        </mixed-citation>
      </ref>
      <ref id="ref6">
        <mixed-citation>
          [44]
          <string-name>
            <given-names>M.</given-names>
            <surname>Fröbe</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Wiegmann</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Kolyada</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Grahm</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Elstner</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Loebe</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Hagen</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Stein</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Potthast</surname>
          </string-name>
          ,
          <article-title>Continuous Integration for Reproducible Shared Tasks with TIRA.io</article-title>
          , in: J.
          <string-name>
            <surname>Kamps</surname>
            ,
            <given-names>L.</given-names>
          </string-name>
          <string-name>
            <surname>Goeuriot</surname>
            ,
            <given-names>F.</given-names>
          </string-name>
          <string-name>
            <surname>Crestani</surname>
            ,
            <given-names>M.</given-names>
          </string-name>
          <string-name>
            <surname>Maistro</surname>
            ,
            <given-names>H.</given-names>
          </string-name>
          <string-name>
            <surname>Joho</surname>
            ,
            <given-names>B.</given-names>
          </string-name>
          <string-name>
            <surname>Davis</surname>
            ,
            <given-names>C.</given-names>
          </string-name>
          <string-name>
            <surname>Gurrin</surname>
            ,
            <given-names>U.</given-names>
          </string-name>
          <string-name>
            <surname>Kruschwitz</surname>
            ,
            <given-names>A</given-names>
          </string-name>
          . Caputo (Eds.),
          <source>Proc. of ECIR, Lecture Notes in Computer Science</source>
          , Springer,
          <year>2023</year>
          , pp.
          <fpage>236</fpage>
          -
          <lpage>241</lpage>
          . doi:
          <volume>10</volume>
          .1007/978-3-
          <fpage>031</fpage>
          - 28241-6_
          <fpage>20</fpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref7">
        <mixed-citation>
          [45]
          <string-name>
            <given-names>N.</given-names>
            <surname>Stefanovitch</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Piskorski</surname>
          </string-name>
          ,
          <article-title>Holistic Inter-Annotator Agreement and Corpus Coherence Estimation in a Large-scale Multilingual Annotation Campaign</article-title>
          , in: H.
          <string-name>
            <surname>Bouamor</surname>
            ,
            <given-names>J.</given-names>
          </string-name>
          <string-name>
            <surname>Pino</surname>
            ,
            <given-names>K.</given-names>
          </string-name>
          Bali (Eds.),
          <source>Proc. of EMNLP</source>
          , ACL,
          <year>2023</year>
          , pp.
          <fpage>71</fpage>
          -
          <lpage>86</lpage>
          . doi:
          <volume>10</volume>
          .18653/v1/
          <year>2023</year>
          .emnlp-main.
          <volume>6</volume>
          .
        </mixed-citation>
      </ref>
      <ref id="ref8">
        <mixed-citation>
          [46]
          <string-name>
            <given-names>M. V.</given-names>
            <surname>Nguyen</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V. D.</given-names>
            <surname>Lai</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A. P. B.</given-names>
            <surname>Veyseh</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T. H.</given-names>
            <surname>Nguyen</surname>
          </string-name>
          ,
          <article-title>Trankit: A Light-Weight Transformer-based Toolkit for Multilingual Natural Language Processing</article-title>
          , in: D.
          <string-name>
            <surname>Gkatzia</surname>
          </string-name>
          , D. Seddah (Eds.),
          <source>Proc. of EACL</source>
          , ACL,
          <year>2021</year>
          , pp.
          <fpage>80</fpage>
          -
          <lpage>90</lpage>
          . doi:
          <volume>10</volume>
          .18653/v1/
          <year>2021</year>
          .eacl-demos.
          <volume>10</volume>
          .
        </mixed-citation>
      </ref>
      <ref id="ref9">
        <mixed-citation>
          [47]
          <string-name>
            <given-names>J.</given-names>
            <surname>Devlin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Chang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Lee</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Toutanova</surname>
          </string-name>
          ,
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          , in: J.
          <string-name>
            <surname>Burstein</surname>
            ,
            <given-names>C.</given-names>
          </string-name>
          <string-name>
            <surname>Doran</surname>
          </string-name>
          , T. Solorio (Eds.),
          <source>Proc. of NAACL-HLT, ACL</source>
          ,
          <year>2019</year>
          , pp.
          <fpage>4171</fpage>
          -
          <lpage>4186</lpage>
          . doi:
          <volume>10</volume>
          .18653/V1/N19-1423.
        </mixed-citation>
      </ref>
      <ref id="ref10">
        <mixed-citation>
          [48]
          <string-name>
            <given-names>H.</given-names>
            <surname>Yunis</surname>
          </string-name>
          , Arthur Schopenhauer at
          <article-title>Touché 2024: Multi-Lingual Text Classification Using Ensembles of Large Language Models</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref11">
        <mixed-citation>
          [49]
          <string-name>
            <given-names>A.</given-names>
            <surname>Aydin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Shaar</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Cardie</surname>
          </string-name>
          ,
          <article-title>Edward said at touché: Human values classification</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref12">
        <mixed-citation>
          [50]
          <string-name>
            <given-names>M.</given-names>
            <surname>Morren</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Mishra</surname>
          </string-name>
          ,
          <article-title>Eric fromm at touché: Prompts vs finetuning</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref13">
        <mixed-citation>
          [51]
          <string-name>
            <given-names>S.</given-names>
            <surname>Legkas</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Christodoulou</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Zidianakis</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Koutrintzes</surname>
          </string-name>
          , G. Petasis,
          <string-name>
            <given-names>M.</given-names>
            <surname>Dagioglou</surname>
          </string-name>
          ,
          <article-title>Hierocles of alexandria at touché: Multi-task &amp; multi-head custom architecture with transformer-based models for human value detection</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref14">
        <mixed-citation>
          [52]
          <string-name>
            <given-names>N.</given-names>
            <surname>Goyal</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Du</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Ott</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Anantharaman</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Conneau</surname>
          </string-name>
          ,
          <article-title>Larger-Scale Transformers for Multilingual Masked Language Modeling</article-title>
          , in: A.
          <string-name>
            <surname>Rogers</surname>
            ,
            <given-names>I. Calixto</given-names>
          </string-name>
          , I. Vulic,
          <string-name>
            <given-names>N.</given-names>
            <surname>Saphra</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Kassner</surname>
          </string-name>
          ,
          <string-name>
            <given-names>O.</given-names>
            <surname>Camburu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Bansal</surname>
          </string-name>
          , V. Shwartz (Eds.),
          <source>Proc. of RepL4NLP@ACL-IJCNLP, ACL</source>
          ,
          <year>2021</year>
          , pp.
          <fpage>29</fpage>
          -
          <lpage>33</lpage>
          . doi:
          <volume>10</volume>
          . 18653/V1/
          <year>2021</year>
          .REPL4NLP-1.4.
        </mixed-citation>
      </ref>
      <ref id="ref15">
        <mixed-citation>
          [53]
          <string-name>
            <given-names>V.</given-names>
            <surname>Yeste</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M. C.</given-names>
            <surname>Ardanuy</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Rosso</surname>
          </string-name>
          ,
          <article-title>Philo of Alexandria at Touché: A Cascade Model Approach to Human Value Detection</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref16">
        <mixed-citation>
          [54]
          <string-name>
            <surname>P. K</surname>
          </string-name>
          ,
          <string-name>
            <surname>D. K</surname>
          </string-name>
          ,
          <string-name>
            <surname>C. Reddy</surname>
            ,
            <given-names>A.</given-names>
          </string-name>
          <article-title>M, SCaLAR NITK at Touché: Human Value Detection</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref17">
        <mixed-citation>
          [55]
          <string-name>
            <given-names>N.</given-names>
            <surname>Mirzakhmedova</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Kiesel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Alshomary</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Heinrich</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Handke</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Cai</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Barriere</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Dastgheib</surname>
          </string-name>
          ,
          <string-name>
            <given-names>O.</given-names>
            <surname>Ghahroodi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Sadraei</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Asgari</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Kawaletz</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Wachsmuth</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Stein</surname>
          </string-name>
          ,
          <article-title>The Touché23- ValueEval Dataset for Identifying Human Values behind Arguments</article-title>
          , in:
          <string-name>
            <given-names>N.</given-names>
            <surname>Calzolari</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.-Y.</given-names>
            <surname>Kan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Hoste</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Lenci</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Sakti</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Xue</surname>
          </string-name>
          (Eds.),
          <source>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</source>
          , International Committee on Computational Linguistics,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref18">
        <mixed-citation>
          [56]
          <string-name>
            <given-names>T.</given-names>
            <surname>Erjavec</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Ogrodniczuk</surname>
          </string-name>
          , et al.,
          <article-title>The ParlaMint corpora of parliamentary proceedings</article-title>
          ,
          <source>LREC</source>
          <volume>57</volume>
          (
          <year>2022</year>
          )
          <fpage>415</fpage>
          -
          <lpage>448</lpage>
          . doi:10.1007/s10579-021-09574-0.
        </mixed-citation>
      </ref>
      <ref id="ref19">
        <mixed-citation>
          [57]
          <string-name>
            <given-names>Ç.</given-names>
            <surname>Çöltekin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Kopp</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Katja</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Morkevicius</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ljubešić</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Erjavec</surname>
          </string-name>
          ,
          <article-title>Multilingual Power and Ideology identification in the Parliament: a reference dataset and simple baselines</article-title>
          , in:
          <string-name>
            <given-names>D.</given-names>
            <surname>Fiser</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Eskevich</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Bordon</surname>
          </string-name>
          (Eds.),
          <source>4th Workshop on Creating, Analysing, and Increasing Accessibility of Parliamentary Corpora</source>
          , ELRA and ICCL,
          <year>2024</year>
          , pp.
          <fpage>94</fpage>
          -
          <lpage>100</lpage>
          . URL: https://aclanthology.org/2024.parlaclarin-1.14.
        </mixed-citation>
      </ref>
      <ref id="ref20">
        <mixed-citation>
          [58]
          <string-name>
            <given-names>O.</given-names>
            <surname>Palmqvist</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Jiremalm</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Picazo-Sanchez</surname>
          </string-name>
          , Policy Parsing Panthers at Touché:
          <article-title>Ideology and Power Identification in Parliamentary Debates</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref21">
        <mixed-citation>
          [59]
          <string-name>
            <given-names>T.</given-names>
            <surname>Mesnard</surname>
          </string-name>
          , et al.,
          <source>Gemma: Open Models Based on Gemini Research and Technology</source>
          ,
          <year>2024</year>
          . doi:10.48550/arXiv.2403.08295. arXiv:2403.08295.
        </mixed-citation>
      </ref>
      <ref id="ref22">
        <mixed-citation>
          [60]
          <string-name>
            <given-names>P.</given-names>
            <surname>Mirunalini</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Koushik</surname>
          </string-name>
          , D. S,
          <string-name>
            <given-names>D.</given-names>
            <surname>Seshan</surname>
          </string-name>
          , Trojan Horses at Touché:
          <article-title>Logistic regression for classification of political debates</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref23">
        <mixed-citation>
          [61]
          <string-name>
            <given-names>J.</given-names>
            <surname>Hariharakrishnan</surname>
          </string-name>
          ,
          <string-name>
            <surname>J. S</surname>
          </string-name>
          , P. Mirunalini, Pixel Phantoms at Touché:
          <article-title>Ideology and power identification in parliamentary debates using linear SVC</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref24">
        <mixed-citation>
          [62]
          <string-name>
            <given-names>V.</given-names>
            <surname>Sanh</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Debut</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Chaumond</surname>
          </string-name>
          , T. Wolf,
          <article-title>DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter</article-title>
          ,
          <year>2020</year>
          . arXiv:1910.01108.
        </mixed-citation>
      </ref>
      <ref id="ref25">
        <mixed-citation>
          [63]
          <string-name>
            <given-names>N.</given-names>
            <surname>Reimers</surname>
          </string-name>
          ,
          <string-name>
            <surname>I. Gurevych</surname>
          </string-name>
          , Sentence-BERT:
          <article-title>Sentence Embeddings using Siamese BERT-Networks</article-title>
          ,
          <source>in: Proc. of EMNLP</source>
          , ACL,
          <year>2019</year>
          , pp.
          <fpage>3982</fpage>
          -
          <lpage>3992</lpage>
          . doi:10.18653/v1/D19-1410.
        </mixed-citation>
      </ref>
      <ref id="ref26">
        <mixed-citation>
          [64]
          <string-name>
            <surname>K</surname>
          </string-name>
          . V,
          <string-name>
            <surname>K. S</surname>
          </string-name>
          , K. A,
          <string-name>
            <surname>M. P</surname>
            ,
            <given-names>S. N</given-names>
          </string-name>
          , Ssnites at Touché:
          <article-title>Ideology and power identification in parliamentary debates using BERT model</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref27">
        <mixed-citation>
          [65]
          <string-name>
            <given-names>S.</given-names>
            <surname>Sevitha</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Patel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Shevgoor</surname>
          </string-name>
          , Team Hale Lab at Touché 2024:
          <article-title>Ideology and power identification in parliamentary debates</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref28">
        <mixed-citation>
          [66]
          <string-name>
            <given-names>M.</given-names>
            <surname>Artetxe</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Schwenk</surname>
          </string-name>
          ,
          <article-title>Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond</article-title>
          ,
          <source>Transactions of the Association for Computational Linguistics</source>
          <volume>7</volume>
          (
          <year>2019</year>
          )
          <fpage>597</fpage>
          -
          <lpage>610</lpage>
          . doi:10.1162/tacl_a_00288.
        </mixed-citation>
      </ref>
      <ref id="ref29">
        <mixed-citation>
          [67]
          <string-name>
            <given-names>S.</given-names>
            <surname>Shwetha</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Kamath</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Balaji</surname>
          </string-name>
          ,
          <string-name>
            <surname>S. N. S. R</surname>
          </string-name>
          , S. Narayanan, Vayam Solve Kurmaha at Touché:
          <article-title>Power Identification in Parliamentary Speeches Using TFIDF Vectorizer and SVM Classifier</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref30">
        <mixed-citation>
          [68]
          <string-name>
            <given-names>C.</given-names>
            <surname>Gerber</surname>
          </string-name>
          , gerber at Touché:
          <article-title>Ideology and power identification in parliamentary debates 2024</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref31">
        <mixed-citation>
          [69]
          <string-name>
            <given-names>A.</given-names>
            <surname>Khurshid</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Das</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Khaskel</surname>
          </string-name>
          , S. Datta,
          <article-title>JU_NLP_DID at Touché</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref32">
        <mixed-citation>
          [70]
          <string-name>
            <given-names>M.</given-names>
            <surname>Andruszak</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Alhamzeh</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Egyed-Zsigmond</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Carlsson</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Leydet</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Otiefy</surname>
          </string-name>
          ,
          <article-title>Team INSA Passau at Touché: Multi-lingual parliamentary speech classification</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref33">
        <mixed-citation>
          [71]
          <string-name>
            <given-names>I.</given-names>
            <surname>Chalkidis</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Fergadiotis</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Malakasiotis</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Aletras</surname>
          </string-name>
          ,
          <string-name>
            <surname>I. Androutsopoulos</surname>
          </string-name>
          , LEGAL-BERT:
          <article-title>The muppets straight out of law school</article-title>
          , in: T. Cohn,
          <string-name>
            <given-names>Y.</given-names>
            <surname>He</surname>
          </string-name>
          , Y. Liu (Eds.),
          <source>Findings of ACL: EMNLP</source>
          <year>2020</year>
          , ACL,
          <year>2020</year>
          , pp.
          <fpage>2898</fpage>
          -
          <lpage>2904</lpage>
          . doi:10.18653/v1/2020.findings-emnlp.261.
        </mixed-citation>
      </ref>
      <ref id="ref34">
        <mixed-citation>
          [72]
          <string-name>
            <given-names>L.</given-names>
            <surname>Zheng</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Guha</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B. R.</given-names>
            <surname>Anderson</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Henderson</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D. E.</given-names>
            <surname>Ho</surname>
          </string-name>
          ,
          <article-title>When does pretraining help?: assessing self-supervised learning for law and the CaseHOLD dataset of 53,000+ legal holdings</article-title>
          ,
          <source>in: Proc. of ICAIL</source>
          , ACM,
          <year>2021</year>
          , pp.
          <fpage>159</fpage>
          -
          <lpage>168</lpage>
          . doi:10.1145/3462757.3466088.
        </mixed-citation>
      </ref>
      <ref id="ref35">
        <mixed-citation>
          [73]
          <string-name>
            <given-names>H.</given-names>
            <surname>Touvron</surname>
          </string-name>
          , et al.,
          <source>LLaMA: Open and Efficient Foundation Language Models</source>
          ,
          <year>2023</year>
          . doi:10.48550/arXiv.2302.13971.
        </mixed-citation>
      </ref>
      <ref id="ref36">
        <mixed-citation>
          [74]
          <string-name>
            <given-names>C.</given-names>
            <surname>Rapp</surname>
          </string-name>
          , Aristotle's Rhetoric, in: E. N.
          <string-name>
            <surname>Zalta</surname>
          </string-name>
          , U. Nodelman (Eds.),
          <source>The Stanford Encyclopedia of Philosophy</source>
          , Metaphysics Research Lab, Stanford University,
          <year>2023</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref37">
        <mixed-citation>
          [75]
          <string-name>
            <given-names>J. E.</given-names>
            <surname>Kjeldsen</surname>
          </string-name>
          ,
          <article-title>Virtues of visual argumentation: How pictures make the importance and strength of an argument salient</article-title>
          ,
          <year>2013</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref38">
        <mixed-citation>
          [76]
          <string-name>
            <given-names>J.</given-names>
            <surname>Achiam</surname>
          </string-name>
          , et al.,
          <source>GPT-4 Technical Report</source>
          ,
          <year>2024</year>
          . arXiv:2303.08774.
        </mixed-citation>
      </ref>
      <ref id="ref39">
        <mixed-citation>
          [77]
          <string-name>
            <given-names>H.</given-names>
            <surname>Liu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Li</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Q.</given-names>
            <surname>Wu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y. J.</given-names>
            <surname>Lee</surname>
          </string-name>
          , Visual instruction tuning,
          <year>2023</year>
          . arXiv:2304.08485.
        </mixed-citation>
      </ref>
      <ref id="ref40">
        <mixed-citation>
          [78]
          <string-name>
            <given-names>B.</given-names>
            <surname>Ostrower</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Aphiwetsa</surname>
          </string-name>
          , Ds@gt at touché:
          <article-title>Image search and ranking via clip and image generation</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref41">
        <mixed-citation>
          [79]
          <string-name>
            <given-names>R.</given-names>
            <surname>Rombach</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Blattmann</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Lorenz</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Esser</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Ommer</surname>
          </string-name>
          ,
          <article-title>High-resolution image synthesis with latent diffusion models</article-title>
          ,
          <source>in: Proc. of CVPR</source>
          , IEEE,
          <year>2022</year>
          , pp.
          <fpage>10674</fpage>
          -
          <lpage>10685</lpage>
          . doi:10.1109/CVPR52688.2022.01042.
        </mixed-citation>
      </ref>
      <ref id="ref42">
        <mixed-citation>
          [80]
          <string-name>
            <given-names>T.</given-names>
            <surname>Janusko</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Kämpf</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Keiling</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Knick</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D. S. M.</given-names>
            <surname>Thiele</surname>
          </string-name>
          ,
          <article-title>Htw-dil at touché: Multimodal dense information retrieval for arguments</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          , A. G. S. de Herrera (Eds.),
          <source>Working Notes of the Conference and Labs of the Evaluation Forum (CLEF</source>
          <year>2024</year>
          ), CEUR Workshop Proceedings, CEUR-WS.org,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref43">
        <mixed-citation>
          [81]
          <string-name>
            <given-names>Y.</given-names>
            <surname>Li</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Bubeck</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Eldan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A. D.</given-names>
            <surname>Giorno</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Gunasekar</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y. T.</given-names>
            <surname>Lee</surname>
          </string-name>
          ,
          <source>Textbooks Are All You Need II: phi-1.5 technical report</source>
          ,
          <year>2023</year>
          . URL: https://arxiv.org/abs/2309.05463.
        </mixed-citation>
      </ref>
      <ref id="ref44">
        <mixed-citation>
          [82]
          <string-name>
            <given-names>X.</given-names>
            <surname>Zhai</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Mustafa</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Kolesnikov</surname>
          </string-name>
          , L. Beyer,
          <article-title>Sigmoid loss for language image pre-training</article-title>
          ,
          <source>in: Proc. of ICCV, IEEE Computer Society</source>
          ,
          <year>2023</year>
          , pp.
          <fpage>11941</fpage>
          -
          <lpage>11952</lpage>
          . doi:10.1109/iccv51070.2023.01100.
        </mixed-citation>
      </ref>
    </ref-list>
  </back>
</article>