<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta />
    <article-meta>
      <title-group>
        <article-title>Extended overview of the CLEF 2024 LongEval Lab on Longitudinal Evaluation of Model Performance</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author">
          <string-name>Rabab Alkhalifa</string-name>
          <email>raalkhalifa@iau.edu.sa</email>
          <xref ref-type="aff" rid="aff3">3</xref>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Hsuvas Borkakoty</string-name>
          <email>borkakotyh@cardiff.ac.uk</email>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Romain Deveaud</string-name>
          <email>r.deveaud@qwant.com</email>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Alaa El-Ebshihy</string-name>
          <email>alaa.el-ebshihy@researchstudio.at</email>
          <xref ref-type="aff" rid="aff6">6</xref>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Luis Espinosa-Anke</string-name>
          <email>espinosa-ankel@cardiff.ac.uk</email>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Tobias Fink</string-name>
          <email>tobias.fink@researchstudio.at</email>
          <xref ref-type="aff" rid="aff6">6</xref>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Petra Galuščáková</string-name>
          <email>petra.galuscakova@uis.no</email>
          <xref ref-type="aff" rid="aff10">10</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Gabriela Gonzalez-Saez</string-name>
          <email>gabriela-nicole.gonzalez-saez@univ-grenoble-alpes.fr</email>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Lorraine Goeuriot</string-name>
          <email>lorraine.goeuriot@univ-grenoble-alpes.fr</email>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>David Iommi</string-name>
          <email>david.iommi@researchstudio.at</email>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Maria Liakata</string-name>
          <email>m.liakata@qmul.ac.uk</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff11">11</xref>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Harish Tayyar Madabushi</string-name>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Pablo Medina-Alias</string-name>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Philippe Mulhem</string-name>
          <email>Philippe.Mulhem@imag.fr</email>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Florina Piroi</string-name>
          <email>florina.piroi@researchstudio.at</email>
          <xref ref-type="aff" rid="aff6">6</xref>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Martin Popel</string-name>
          <email>popel@ufal.mff.cuni.cz</email>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Arkaitz Zubiaga</string-name>
          <email>a.zubiaga@qmul.ac.uk</email>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <aff id="aff0">
          <label>0</label>
          <institution>Alan Turing Institute</institution>
          ,
          <country country="UK">UK</country>
        </aff>
        <aff id="aff1">
          <label>1</label>
          <institution>Cardiff University</institution>
          ,
          <country country="UK">UK</country>
        </aff>
        <aff id="aff2">
          <label>2</label>
          <institution>Charles University</institution>
          ,
          <addr-line>Prague</addr-line>
          ,
          <country country="CZ">Czech Republic</country>
        </aff>
        <aff id="aff3">
          <label>3</label>
          <institution>Imam Abdulrahman Bin Faisal University</institution>
          ,
          <addr-line>SA</addr-line>
        </aff>
        <aff id="aff4">
          <label>4</label>
          <institution>Institute of Engineering Univ. Grenoble Alpes. CLEF 2024: Conference and Labs of the Evaluation Forum</institution>
        </aff>
        <aff id="aff5">
          <label>5</label>
          <institution>Qwant</institution>
          ,
          <country country="FR">France</country>
        </aff>
        <aff id="aff6">
          <label>6</label>
          <institution>Research Studios Austria, Data Science Studio</institution>
          ,
          <addr-line>Vienna, AT</addr-line>
        </aff>
        <aff id="aff7">
          <label>7</label>
          <institution>TU Wien</institution>
          ,
          <country country="AT">Austria</country>
        </aff>
        <aff id="aff8">
          <label>8</label>
          <institution>Univ. Grenoble Alpes</institution>
          ,
          <addr-line>CNRS, Grenoble INP</addr-line>
        </aff>
        <aff id="aff9">
          <label>9</label>
          <institution>University of Bath</institution>
          ,
          <country country="UK">UK</country>
        </aff>
        <aff id="aff10">
          <label>10</label>
          <institution>University of Stavanger</institution>
          ,
          <addr-line>Stavanger</addr-line>
          ,
          <country country="NO">Norway</country>
        </aff>
        <aff id="aff11">
          <label>11</label>
          <institution>University of Warwick</institution>
          ,
          <country country="UK">UK</country>
        </aff>
      </contrib-group>
      <abstract>
        <p>We describe the second edition of the LongEval CLEF 2024 shared task. This lab evaluates the temporal persistence of Information Retrieval (IR) systems and Text Classifiers. Task 1 requires IR systems to run on corpora acquired at several timestamps, and evaluates the drop in system quality (NDCG) along these timestamps. Task 2 tackles binary sentiment classification at different points in time, and evaluates the performance drop for different temporal gaps. Overall, 37 teams registered for Task 1 and 25 for Task 2. Ultimately, 14 and 4 teams participated in Task 1 and Task 2, respectively.</p>
      </abstract>
      <kwd-group>
        <kwd>Evaluation</kwd>
        <kwd>Temporal Persistence</kwd>
        <kwd>Temporal Generalisability</kwd>
        <kwd>Information Retrieval</kwd>
        <kwd>Text Classification</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec id="sec-1">
      <title>1. Introduction</title>
      <p>
        Outside the strict scientific context, the European Artificial Intelligence Act 1, adopted by European
Commission in 2024, stresses in Article 17, section (d), that providers must comply with “examination,
test and validation procedures to be carried out before, during and after the development of the high-risk
AI system, and the frequency with which they have to be carried out”. Without focusing here on the
degree of risk of Information Retrieval or Classification systems, this Act clearly states that AI systems
must tackle evolution. Time is a dimension that is often overlooked when conducting Information
Retrieval (IR) experiments, especially when static data sets are utilized. The advantages of such datasets
are that they are easily used to evaluate and test systems. Some data sets, like CORD19, contain
documents collected at different points in time, showing differences in the set of documents from one
collection time to another. Recent research [
        <xref ref-type="bibr" rid="ref1">1</xref>
        ] has demonstrated that models trained on data pertaining
to a particular time period struggle to keep their performance levels when applied on test data that is
distant in time. On the other hand, [
        <xref ref-type="bibr" rid="ref2">2</xref>
        ] showed that neural systems, especially transformers-based ones,
are not always very sensitive to corpus evolution.
      </p>
      <p>With the aim of tackling this challenge of making models have persistent quality over time, the
objective of the LongEval lab is twofold: (i) to explore the extent to which temporal differences over
time, as reflected in the evolution of evaluation datasets, result in the deterioration of the performance
of information retrieval and classification systems, and (ii) to propose improved methods that mitigate
performance drop by making models more robust over time.</p>
      <p>
        The LongEval lab [
        <xref ref-type="bibr" rid="ref3">3</xref>
        ] took place as part of the Conference and Labs of the Evaluation Forum (CLEF)
2024, and consisted in two separate tasks: (i) Task 1, described in Section 2, focused on information
retrieval, and (ii) Task 2, described in Section 3, focused on text classification for sentiment analysis.
Both tasks provided labeled datasets enabling analysis and evaluation of models over data evolving in
time (what we call “longitudinally evolving data”). In this paper, we add details to [
        <xref ref-type="bibr" rid="ref4">4</xref>
        ], by focusing on
the dataset statistics, and on analysing in detail the overall participant runs and results for each task.
      </p>
    </sec>
    <sec id="sec-2">
      <title>2. Task 1 - Retrieval</title>
      <p>
        The retrieval task of LongEval 2024 explores the effect of changes in datasets on retrieval of text
documents. More specifically, we focus on a setup in which the datasets are evolving, as in the
LongEval 2023 Retrieval Task data [
        <xref ref-type="bibr" rid="ref3">3</xref>
        ]. This means, that one dataset can be acquired from another by
adding, removing (and replacing) a limited number of documents and queries. The two main scenarios
considered focus on one single system or on several ones, as detailed below:
      </p>
      <sec id="sec-2-1">
        <title>A single system in an evolving setup</title>
        <p>We explore how one selected system behaves when evaluated on several collections, which evolve
over time. The context in which this task takes place is retrieval performance for Web search.
When considering the evolution of Web data over time, we are facing a case in which the documents, the
queries and also the relevance continuously evolve. We are then studying how Web search engines
deal with this situation. The considered scenario is thus similar to classical ad-hoc search, in the case
of evolving data sets. The evaluation in this scenario considers both the Web search case, in which the
top documents are the most important elements considered, and should take into account the evolving
nature of the data. Evaluation should ideally reflect the changes in the collection and especially signal
substantial changes that could lead to a performance drop. This would allow re-training the search engine
model only when it is really necessary, and enable much more efficient overall training.</p>
        <p>As described earlier, there is no consensus about the stability of the performance of the neural
networks IR systems along time, but it seems to be lower than in the case of statistical models.
Moreover, the performance strongly depends on the data used for training the neural model. One</p>
        <sec id="sec-2-1-1">
          <title>1https://www.europarl.europa.eu/doceo/document/TA-9-2024-0138_EN.html</title>
          <p>objective of the task is to explore the behavior of the neural system in the evolving data scenario.</p>
        </sec>
      </sec>
      <sec id="sec-2-2">
        <title>Comparison of multiple systems in an evolving setup</title>
        <p>
          While in the first point we explore a single system, the comparison of this system with multiple systems
across evolving collections should provide more information about system stability and robustness.
2.1. Description of the task
Compared to the LongEval 2023 Dataset [
          <xref ref-type="bibr" rid="ref3">3</xref>
          ], in 2024 we take larger lags between the training and the
test sets. More precisely, the task is composed of:
• One training set, that contains Web documents, actual users’ queries, and assessments, acquired
at timestamp t;
• Two test sets, acquired later than t at times t′ and t″, composed of Web documents and users’
queries.
        </p>
        <p>The task datasets were created over sequential time periods, which allows doing observations at different
time stamps t, and most importantly, comparing the performance across different time stamps t and
t′. So, the IR task aims to assess the performance difference between t′ and t″ when t″ occurs after t′,
according to the fact that the training set, acquired at t, takes place a few months before t′.
2.2. Dataset
As for LongEval 2023, in 2024 the data for this task were provided by the French search engine Qwant.
They consist of the queries issued by the users of this search engine, cleaned Web documents, which
were 1) selected to correspond to the queries, and 2) to add additional noise, and relevance judgments,
which were created using a click model. The dataset is fully described in [5]. We provided training data,
which included 599 train queries, with corresponding 9,785 relevance assessments and 2,049,729 Web
pages. All training data were collected during January 2023. The test set corpus is composed of two
subsets: Lag6 acquired in June 2023 (i.e., 6 months later than the training set), and Lag8 acquired in
August 2023 (i.e. acquired 8 months later than the training set). The test dataset contains 4,321,642
documents (June: 1,790,028; August: 2,531,614) and 1,925 test queries (June: 407; August: 1,518). The
datasets are accessible through the lab’s webpage2 and from the TU Wien Research Data Repository3.</p>
        <p>The data collected from the Qwant search engine is in French. In a way to help participants, the
LongEval data set for the Retrieval task also contains automatic translations into English of both queries
and documents. We mention however that the translations provided by LongEval are only applied to
the first 500 characters of each sentence of the initial French documents downloaded.</p>
        <p>The document and query overlap ratios between the collections is given by Table 1 and Table 2.
We see from these tables that there is a substantial overlap between the Train and the Test collection
documents and (due to the larger size of the August query set) a substantial overlap between the Train /
June queries and the August queries.
Train 2024
June (Lag6)
August (Lag8)
1.00
0.77
0.75
0.67
1.00
0.69
0.93
0.97
1.00</p>
        <p>To evaluate the submissions we use one set of relevance judgments: the judgments acquired by the
Qwant click model. For the evaluation, we use the NDCG measure (calculated for each dataset) at 10, as
well as the drop between the Lag8 and Lag6 collections. This allows us to check to what extent the IR
systems face the evolution of the data. We also plan to use manual assessments, acquired through the
interface described in section 2.8.
2.3. Submissions
14 teams submitted their systems to the Retrieval task. Each team was allowed to submit up to 10
systems. Together, this amounts to an overall of 73 runs submitted. Two teams submitted their runs on the wrong
test data set, so we do not include their submission results in our further analysis.
2.4. Absolute Scores
For the Retrieval task of the LongEval lab, we computed two sets of scores for each of the lags in the
test collection, namely NDCG and MAP. Table 3 gives the overview of them for each run on the Lag6
and Lag8 datasets. For each run, the columns of the table indicate which language was used (English,
French, or both), whether neural approaches were involved (values yes/no), and whether a single or
a combination of several approaches was used (values yes/no). In addition, we show NDCG score
histograms for these runs, in decreasing order, for each dataset, showing whether a run uses any neural
approach (green for yes, yellow for no) in Figure 1, and whether the run uses a combination of more
than a single approach (orange for yes, cyan for no) in Figure 2. This information was acquired from
the participants through a questionnaire they had to fill in for each submitted run. Figure 3
shows which language each run made use of.</p>
        <p>From Table 3 we see that the systems which did best for the Lag6 data are also among the top for the
Lag8, where the first ranked nine systems scores are comparable to each other. For instance, the best
system on Lag6, according to the NDCG measure, (dam_run_4), is ranked the second best also on Lag8.
Similarly, the best system on Lag8, according to the NDCG measure, (mouse_run_8), is ranked the
second best also on Lag6. This finding holds for the MAP measure as well.</p>
        <p>Here, we describe the methods used in the top-3 runs, according to the NDCG evaluation measure,
for both Lag6 and Lag8 datasets.</p>
        <p>1. dam_run_4 from the DAM team: This system uses BM25 as a first stage retrieval model, enhanced
with proximity search, query expansion via synonyms, and the MBNET model [6], which combines
BERT and XLNET, for re-ranking the results.
2. mouse_run_8 from MOUSE team: This system also uses BM25 as a first stage retrieval model,
enhanced with an LLM-based re-ranking model using the Cohere API4. It utilizes the Llama 3
model [7] for query expansion.
3. mouse_run_10 from MOUSE team: Similar to mouse_run_8, this system uses BM25 as first stage
retrieval model, but it is enhanced with a deep neural-based re-ranking model using PyGaggle. It
also employs the Llama 3 model for query expansion.</p>
        <p>For the Lag8 dataset, the top-3 systems are:
1. mouse_run_9 from MOUSE team: This system uses BM25 as a first stage retrieval model, enhanced
with a deep neural-based re-ranking model using PyGaggle5. It uses the Mixtral model [8] for
query expansion.</p>
        <sec id="sec-2-2-1">
          <title>2. mouse_run_8 from MOUSE team: Described above.</title>
        </sec>
        <sec id="sec-2-2-2">
          <title>3. mouse_run_10 from MOUSE team: Described above.</title>
        </sec>
        <sec id="sec-2-2-3">
          <title>4https://docs.cohere.com/docs/rerank-2 5https://github.com/castorini/pygaggle</title>
          <p>Generally, most of the solutions chosen by the participants to the LongEval Retrieval task apply a
multi-stage retrieval approach. Often, the first stage involves a lexical-based retrieval (e.g., BM25), and
query expansion methods like PL2 or BO1. Query expansion is also done by employing Large Language
Models, like Mistral or Llama 3. Reranking is done either using neural-based methods or sentence
based transformers. Listwise rerankers and fusing have also been used in reranking of retrieved results.
Notably, the temporal aspect of the LongEval test collection has been used by some participants to
include past query relevance information into query reformulation, either from click logs or from the
documents deemed relevant in the previous collections.</p>
          <p>Considering the Figures 1, 2 and 3, we see that the shape of the distribution of the NDCG values are
similar for the Lag6 and Lag8 datasets. However, the systems have higher performances on Lag6 than
on Lag8, with maximum 0.4 value for the NDCG on the Lag6 versus 0.3 for the Lag8.
2.5. Changes in the Scores
The main part of the retrieval task is to study the changes in the performance scores between the
collections. The collections were created using the same approach and procedure and have a relatively
high overlap in terms of both queries and documents (see Tables 1 and 2), we thus provide the Relative
NDCG Drop (RND) values of systems between the collections Lag8 and Lag6. RND(r), for a system r, is
defined as:
RND(r) = (NDCG6(r) − NDCG8(r)) / NDCG6(r)</p>
          <p>With such a definition, small RND values mean systems that are more robust against changes, and large RND
values mean that the systems are not able to generalize well between Lag6 and Lag8. What we see in
Table 4 is that the systems which are more robust to the evolution of the test collections (low values of
RND) are not the best ones: for instance, ows_run_4 is the most robust system but the third worst one
in Table 3. The best systems in terms of NDCG values on Lag6, dam_run_4 and mouse_run_8, have an
RND of 0.245, which means that they are quite robust, but much less so than the most robust ones. This shows
that the very best systems do cope to some extent with the evolution of the corpus, but that there is
room for improving the robustness of the best systems. We also see that the least robust system against
changes, cir_run_3, is a system that does not rely on neural IR models: such a finding shows that neural
models are also likely to be more robust against changes than non-neural ones.</p>
2.6. Run Rankings
Another point of view studied is how the submitted runs compare to each other, either in terms of
the absolute NDCG scores achieved on the collections, or in terms of NDCG changes between the
collections. We also calculated the Pearson correlation between the runs (not shown here), with high
correlation in terms of NDCG scores, 0.99, and similarly high, 0.98, with respect to ranking order. This
corresponds to the relatively high overlaps of the documents and also the queries between Lag6 and
Lag8 collections (Table 1 and Table 2). This observation does not hold for the correlation between the
ranking according to the NDCG score achieved and the ranking of the performance change, which is
relatively low. The Pearson correlation is 0.07 for the Lag6 dataset and -0.05 on the Lag8 dataset.</p>
          <p>Last, we calculated a combination of both rankings (ranking in terms of absolute values and ranking
in terms of change). For this, we first calculated a Borda count of the ranking in terms of absolute
values and Borda count of the ranking in terms of relative change and then we simply summed these
two Borda counts: this result is displayed in the last column in the Table 5. We see that in terms of this
measure the top performing systems (on Lag6 and Lag8 datasets) are ranked higher, although they have
lower rank in terms of the rank of the NDCG change.</p>
          <p>RND
2
3
7
1
5
8
9
6
10
12
14
15
19
17
20
18
16
13
11
27
23
26
21
22
25
24
34
29
33
28
35
32
31
30
39
43
42
38
41
40
37
36
45
47
44
48
46
49
54
52
50
53
2.7. Queries Overview
We further investigate performance on the provided queries. Due to space reasons, we only investigate
a selected subset of queries from each collection. We used a pooling strategy to select these queries
to be used for the manual assessment process (described in Section 2.8). We first selected the top
five performing runs on the average NDCG performance on both collections. We then calculated the
performance of these runs per query for each collection (i.e. Lag6 and Lag8) and sorted the queries
based on their NDCG performance for the five runs. Then, we divided the query set in each collection
into four sets and randomly selected from each set: five and 10 queries from Lag6 and Lag8, respectively.
We selected in total 20 queries from the Lag6 collection and 40 from the Lag8 collection. We selected more
queries from the Lag8 collection since, as shown in Table 2, the number of queries in the Lag8 collection
is higher than in the Lag6 collection.</p>
          <p>An overview of the scores achieved for the selected queries in each collection is displayed in Figure 4.
The figure shows the minimum performance (by any submitted run), the 25% quantile, the 75% quantile and the
maximum achieved NDCG score. Due to a relatively large number of runs, the range of the scores
achieved is typically quite large and for some of the queries it even ranges between 0 and 0.8. It can
also be noticed that the variation (corresponding to the size of the boxplot) of the query performance for
the Lag8 collection is higher than for the Lag6 collection.</p>
          <p>Some of the worst performing queries are very general (“birdsong”, “taxes”, and “used car” for
instance) and can thus be expected to be ambiguous. This is in contrast with the top performing queries
(e.g. “camping concarneau”, “Prune rabbit”, and “point bordeaux vision”) which refer to more specific
information need. Some other top performing queries have high variation in the results, e.g. the query
“origami bird” for which it is not specified whether the user asks about "origami bird" itself or looks for
tutorials to make them.
2.8. Manual relevance judgments acquisition
The evaluation results of LongEval IR task presented above rely on automatic assessments generated
from click models [5]. In addition to these click-based relevance assessments, we have set up an
annotation tool to acquire further relevance assessments by humans. For that, we used the open source
annotation tool, Doctag [17], on a sample of the queries selected in section 2.7 (60 queries in total).</p>
          <p>Doctag provides a customizable and portable platform specifically designed for Information Retrieval
(IR) evaluation. To perform manual relevance judgments using Doctag, annotators utilize its web-based
interface. They access the tool and interact with its annotation functionalities, including the assignment
of labels to indicate document relevance to specific queries. Annotators view the documents and
associate appropriate relevance labels (Fig. 5). The documents to be annotated were selected through
pooling the participants runs [18]. For the annotation to remain tractable, we conducted a stratified
sampling and selected 60 queries for evaluation (Section 2.7). We set up dedicated online servers where
Doctag is deployed, through their use we have acquired over 25K manual assessments. 2900 documents
from the original dataset were then assessed. The average number of assessments per query is around
429. To perform the manual annotation and assess document relevance for the corresponding queries,
we assigned subsets of the document dataset to a team of 25 annotators. We set up dedicated online
servers where Doctag was deployed. Each annotator was assigned to a specific server to perform the
annotation tasks. This distributed setup allowed for parallel processing, enabling annotators to work
simultaneously and collaborate efectively within their assigned subsets.</p>
          <p>We have recorded an aggregate of 25,759 judgments. These judgments span across four distinct
categories: ’Relevant’, ’Not Relevant’, ’Partially Relevant’, and ’I Don’t Know’.</p>
          <p>Preliminary analysis of the data indicates a more balanced approach among annotators in categorizing
the query-document pairs. Figure 6 presents the judgment distribution for the top 30 queries in terms of
document count. What we observe in Figure 6 is a more evenly distributed number of relevant (green)
and non-relevant (red) documents for many queries. While some queries still show a high number
of relevant documents (with peaks exceeding 300 relevant documents), the number of non-relevant
documents is also significant, indicating no single dominant category. This balanced distribution of
relevant and non-relevant documents is much more equitable than previous analyses, where
nonrelevant judgments predominated.</p>
          <p>Additionally, Figure 7 provides a detailed view of the distribution of judgment counts across all
queries using violin plots. The violin plots reveal that the distributions for relevant and non-relevant
judgments are quite similar, with both categories showing a wide range of counts and high densities
around the median values. The partially relevant category, while also having a substantial number of
judgments, shows a narrower distribution, indicating less variability. The "I don’t know" category has a
very narrow distribution, reflecting its infrequent use among annotators.</p>
          <p>Further evaluation rounds utilizing the collected data are in progress. We will utilize the annotated
documents and relevance annotations from the queries to construct an aggregated Qrel file. With this
Qrel file, we will run the evaluation using trec_eval6 on the participants’ runs. Trec_eval will compare
the system’s retrieved results against the ground truth relevance judgments defined in the Qrel file. This
evaluation process will provide valuable insights by comparing the results of the click model with the
manual annotations, thereby assessing the effectiveness and performance of the information retrieval
system in relation to the specified queries.
2.9. Discussion and conclusion
This task was the second attempt to collectively investigate the impact of the evolution of the data on
search system’s performances. Having 14 participating teams submitting runs confirmed that this topic
was of interest to the community.</p>
          <p>The dataset released for this task consisted in a sequence of test collections corresponding to diferent
times. The collections were composed of documents and queries coming from Qwant, and relevance
judgment coming from a click model and manual assessment. While the manual assessment is ongoing
at the time of the paper’s publication, performances of participants’ submitted runs were measured
using the click logs.</p>
          <p>Most of the submitted runs rely on multi-stage retrieval approaches, in addition to the usage of Large
Language Models in query expansion. The effect of the translation of the documents and queries
provided by the lab has a clear impact: the best results were obtained on the original French data.</p>
          <p>Since each subset had substantial overlaps, the correlations between system rankings were quite
high. As for the robustness of the systems towards dataset changes, we observed that the systems that
were the most robust to the evolution of the test collection were not the best performing ones.</p>
          <p>Further evaluations will be carried out in the near future with the manual assessment of the pooled
sets. A thorough analysis of the results will be necessary to study the impact of queries on the results
(their nature, topic, difficulty, etc.). Further analysis work will be necessary to fully establish the
robustness of the systems and the specific impact of dataset evolution on the performances.</p>
        </sec>
      </sec>
    </sec>
    <sec id="sec-3">
      <title>3. Task 2 - Classification</title>
      <p>Stance detection, an essential task in natural language processing (NLP), involves identifying an author’s
position or attitude towards a particular topic or statement. This task goes beyond simple sentiment
analysis by requiring models to discern not just positive or negative sentiments but also the specific
stance (supporting/believer, opposing/denier, or neutral) towards a given target [19, 20].</p>
      <p>Comprehending the evolution of social media stances over time poses a significant challenge, a topic
that has gained recent interest in the AI and NLP communities but remains relatively unexplored. The
performance of social media stance classifiers is intricately linked to temporal shifts in language and
evolving societal attitudes toward the subject matter [21].</p>
      <p>In LongEval 2024, social media stance detection, a multi-label English classification task, takes center
stage, surpassing the complexity of the binary sentiment task in LongEval 2023 [22]. Our primary goal
is to assess the persistence of stance detection models in the dynamic landscape of social media posts.</p>
      <p>The evolving nature of language and social opinions adds an additional layer of complexity to
the challenges faced by text classifiers. Language undergoes continuous changes, reflecting shifts in
societal norms and opinions and the emergence of novel concepts and words. For instance, consider the
evolution of public opinion on climate change over the past two decades:
• Sentence from 2000: “Global warming is a theory that needs more proof; it’s not urgent.”
• Sentence from 2010: “Evidence for climate change is mounting, and we need to start taking
action.”
• Sentence from 2020: “Climate change is an undeniable crisis that requires immediate global
action.”</p>
      <p>The context over two decades in the above example shows that language and urgency surrounding
climate change have evolved from skepticism to an accepted crisis. Models not updated with recent
discussions and policy changes might fail to accurately capture the critical tone and terminology used
in current dialogues about the environment. Similarly, the rapid emergence of new vocabulary, as
witnessed with terms like COVID-19 [23], highlights the dynamic nature of language, presenting unique
challenges for text classifiers.
3.1. Description of the task
To assess the extent of the performance drop of models over shorter and longer temporal gaps, we
provided a comprehensive training dataset along with five testing sets. These testing sets include two
practice sets and three development sets. The shared competition aimed to stimulate the development
of classifiers that can effectively handle temporal variations and maintain performance persistence over
different time distances. Participants were expected to submit solutions for two sub-tasks, showcasing
their ability to address the challenges of temporal variations in performance. The shared task was in
turn divided into two sub-tasks:</p>
      <p>Sub-Task 1: Short-Term Persistence: In this sub-task, participants were tasked with developing
models that demonstrated performance persistence over short periods. Specifically, the models needed
to maintain their performance over a temporal gap between the within datasets and the short-term
datasets. This involved comparing the performance from the within-practice data (January 2010 to
December 2010) to the short-practice data (January 2014 to December 2014), a time gap of 4 years,
and from the within-dev data (January 2011 to December 2011) to the short-dev data (January 2015
to December 2015), a time gap of 4 years</p>
      <p>Sub-Task 2: Long-Term Persistence: This sub-task required participants to develop models that
maintained performance persistence over a longer period of time. The classifiers were expected to
mitigate performance drops over a temporal gap between the within time datasets and the
longterm datasets. This involved comparing the performance from the within-dev data (January 2011 to
December 2011) to the long-dev data (January 2018 to September 2019), a time gap of approximately 7
to 8 years.</p>
      <p>In addition to the main sub-tasks, participants were also asked to work on models that maintained
performance within the same temporal year of the training set, with the practice-within data covering
January 2010 to December 2010 and the within-dev data covering January 2011 to December 2011,
with no gap between them and the training set (time gap 0).
3.2. Dataset
In this section, we present the process of constructing our final annotated corpus for the task. The
large-scale Climate Change Twitter dataset was originally described in [24]. Our primary focus will be
on climate change stance, time of the post (created at), and the textual content of the tweets, which
we will refer to as the CC-SD dataset. This CC-SD is large-scale, covering a span of 13 years and
containing a diverse set of more than 15 million tweets from various years. Using the BERT model to
annotate tweets, the CC-SD stance labels fall into three categories: those that express support for
the belief in man-made climate change (believer), those that dispute it (denier), and those that remain
neutral on the topic.</p>
      <p>The total sum of the categorized tweets over the entire time span is as follows: 11,292,424 tweets as
believers, 1,191,386 as deniers, and 3,305,601 as neutral, distributed across the timeline. The annotation
is performed using transfer learning with BERT as distant supervision based on another sentiment
climate change dataset 7 and, thus, can be easily manually annotated to improve its precision using
human in the loop.</p>
      <p>Data sampling. The dataset is first downsampled to ensure an equal number of instances for each
stance (neutral, denier, believer) within a specified date range, using the minimum stance count across
all selected months and years to avoid bias. This involves randomly sampling the same number of rows
for each stance, year, and month combination, ensuring balanced representation. The downsampled data
is then shuffled and split into training, development, and practice sets, including short- and long-term
coverage, with any intersecting IDs between these sets being removed to maintain data integrity and
prevent data leakage. Finally, a summary of the downsampled data is generated, detailing the number
of rows, date and time of sampling, and statistics per year and month.</p>
      <p>Test set annotation. We annotate our test data using Prolific 8, which is a high quality data collection
and annotation platform. The forms that contain data to annotate are created using Qualtrics9. We
run the annotation in several batches, and provide the annotation guideline stating the task details
and guidelines for the participants to follow. We add several filters, automatic and manual to select
the optimal demographic and qualified annotators. Additionally, a manual qualification task is also
enforced, which contains 5 tweets from the training set that the organisers first annotate; the majority
annotation is then released as the qualification task. Participants have to correctly answer 4 out of 5
questions to access the actual annotation task. We also provide fields in our form for every annotator to
give their feedback and to point out if any tweet is inappropriate or contains explicit content in it. We
collect responses from 5 annotators for each tweet, and select the majority annotation from the five
annotations. In some cases, we find equal agreement among the annotators, and for those cases, we run
an extra round of annotation to finalise the agreement. Finally, after cleanup and the majority annotation
finding process, we manually check the data and divide it into the respective splits.</p>
      <p>The resulting distribution of data is shown in Table 6, which summarizes the dataset statistics of the
training, practice, and testing sets.</p>
      <p>In the Practice phase, participants undertake Pre-Evaluation tasks with datasets from 2010 and
2014, sampled from CC-SD, allowing them to practice within a recent time frame and over a short
duration. These datasets are manually verified. Additionally, human-annotated "within time" and "short
time" practice sets are provided, also sampled from CC-SD, to refine model development before formal
evaluation.</p>
      <p>
        Subsequently, the Evaluation phase assesses models using datasets from 2011, 2015, and the longer
period of 2018-2019, all sampled from CC-SD. These datasets undergo manual verification and
encompass within-timeframe assessments, short-term predictions, and long-term predictions, offering a
7https://www.kaggle.com/datasets/edqian/twitter-climate-change-sentiment-dataset
8https://www.prolific.com/
9https://www.qualtrics.com/
holistic evaluation of model performance across various temporal contexts. By incorporating datasets
covering different years, the evaluation ensures thorough testing and understanding of models’ temporal
persistence and performance.
3.3. Evaluation
Evaluation metrics for this edition of the task remain consistent with the previous version [
        <xref ref-type="bibr" rid="ref3">3, 25</xref>
        ]. All
submissions were assessed using two key metrics: the macro-averaged F1-score on the corresponding
sub-task’s development set and the Relative Performance Drop (RPD), calculated by comparing
performance on "within time" data against results from short- or long-term distant development
sets. Submissions for each sub-task were ranked primarily based on the macro-averaged F1-score.
Additionally, a unified score, the weighted-F1, was computed between the two sub-tasks, encouraging
participants to contribute to both for accurate placement on a collective leaderboard and a deeper
analysis of their system’s performance in various settings.
      </p>
      <p>Participants were expected to design an experimental architecture to enhance a text classifier’s
temporal performance. As such, the performance of the submissions was evaluated in two ways:
1. Macro-averaged F1-score: This metric measured the overall F1-score on the testing set for
the sentiment classification sub-task. The F1-score combines precision and recall to provide a
balanced measure of model performance. A higher F1-score indicated better performance in
terms of both positive and negative sentiment classification.
2. Relative Performance Drop (RPD): This metric quantified the difference in performance
between the "within-period" data and the short- or long-term distant testing sets. RPD was
computed as the difference in performance scores between two sets. A negative RPD value
indicated a drop in performance compared to the "within-period" data, while a positive value
suggested an improvement.
$F_{1,\mathrm{macro}} = \frac{2 \cdot \mathrm{precision} \cdot \mathrm{recall}}{\mathrm{precision} + \mathrm{recall}}$ (1)
$\mathrm{RPD} = \frac{\mathrm{score}_t - \mathrm{score}_0}{\mathrm{score}_0}$ (2)
Where $\mathrm{score}_0$ represents performance when the time gap is 0, and $\mathrm{score}_t$ represents performance when
the time gap is short or long, as introduced in previous work [26].</p>
      <p>The submissions were ranked primarily based on the macro-averaged F1-score, emphasizing the
overall performance of the stance detection model on the testing sets. The higher the macro-averaged
F1-score, the higher the ranking of the submission.
3.4. Models
In our study, we evaluated several baseline classifiers to assess their performance and temporal
persistence when exposed to evolving data. The models we focused on include bert-base-uncased,
roberta-base, and their respective variations with additional continual incremental pretraining from
the climate change corpus.</p>
      <p>To address the challenges posed by evolving data, we implemented continual incremental pretraining
for both bert-base-uncased and roberta-base models. These variations, referred to as ++MLM 2019,
were further pretrained on a climate change corpus that covers data from the initial training year up to
2019 using masked language modeling. This approach aimed to incorporate recent linguistic trends and
contextual information, enhancing the models’ ability to adapt to new and evolving data.
$F_{1,\mathrm{macro}} = \frac{2 \cdot \mathrm{precision} \cdot \mathrm{recall}}{\mathrm{precision} + \mathrm{recall}}$
$\mathrm{RPD} = \frac{\mathrm{score} - \mathrm{score}_0}{\mathrm{score}_0}$</p>
      <p>The dataset is segmented by years, starting from 2006 to various end years (2011, 2013, 2015, 2017,
2019). For each end year, data from all preceding years up to that point is aggregated and preprocessed.
Preprocessing includes filling missing values with the most frequent value in each column, removing
rows with missing values in the ’text’ or ’stance’ columns, and eliminating duplicate entries. Text data is
normalized to lowercase, and entries with fewer than six words are excluded. After preprocessing, the data is
merged into a single dataset for each end year, resulting in five datasets representing different temporal
spans. These datasets are subsequently balanced by downsampling to ensure uniform representation
for incremental training.</p>
      <p>Using a masked language modeling strategy, the textual data without its label is fed into the models
incrementally in their chronological order, starting with the 2011 sample and ending with the 2019
sample. This approach ensures a balanced and clean dataset, facilitating robust analysis and model
training. Each model was incrementally tested to evaluate its persistence over time, and the best
performance was reported in the results section.</p>
      <p>• bert-base-uncased (Bidirectional Encoder Representations from Transformers) [27] is a
foundational model in NLP that introduced the concept of bidirectional training of transformers
for language modeling. The bert-base-uncased model is a version of BERT that ignores case
sensitivity, which helps in learning case-independent features. It also consists of 12 transformer
layers, 768 hidden units, and 12 attention heads. BERT uses a static masked language modeling
objective during pretraining, which involves predicting masked words in a sentence based on
their context.
• roberta-base (Robustly optimized BERT approach) [28] is a variant of the BERT model designed
to improve performance by optimizing the pretraining process. It uses dynamic masking, a larger
batch size, and more data to enhance the training of transformer-based models. The roberta-base
model consists of 12 transformer layers, 768 hidden units, and 12 attention heads. It is pretrained
on a diverse range of data to capture rich contextual representations, making it effective for
various NLP tasks.
• ++MLM 2019: A masked language modeling strategy used to adapt a language model to new data
by incrementally pretraining with an unlabeled corpus up to 2019. This method leverages recent
linguistic trends and contextual updates to improve model adaptation and performance over time.</p>
      <p>This systematic approach allowed us to evaluate and enhance the models’ temporal persistence and
robustness baselines, ensuring they remain effective in the face of evolving language patterns.
3.5. Results
3.6. Practice phase
This section presents the results obtained during both the practice and evaluation phases of task 2.
In this subsection, we present the results of the practice phase of task 2. This practice dataset was
provided to participants to allow them to practice and initiate their text classifiers. Since we did not
get any submissions and to understand the initial performance of our practice sets, we compared
several baseline classifiers. The models evaluated include roberta-base, bert-base-uncased, and their
respective variations with additional continual incremental pretraining from the climate change corpus
from the initial year of training up to 2019 using masked language modeling. The results are summarized
in Table 7.</p>
      <p>As can be seen from Table 7, the results indicate that the ++MLM 2019 variations of both
roberta-base and bert-base-uncased demonstrate improved f-Within and f-Avg scores compared to their
original counterparts. This suggests that additional continual pretraining based on recent data,
incrementally over time, contributes to better performance persistence. Notably, bert-base-uncased
++MLM 2019 achieved the lowest RPD, highlighting its resilience to temporal changes.
3.7. Evaluation phase
In this subsection, we present the results of the evaluation phase of task 2. Using the development
dataset provided to participants, we evaluated the final performance of the text classifier models. To
understand the performance of our development sets, we compared several baseline classifiers due to
the lack of submissions. The models evaluated include roberta-base, bert-base-uncased, and their
respective variations with additional continual incremental pretraining from the climate change corpus
up to 2019 using masked language modeling. The results are summarized in Table 8.</p>
      <p>As shown in Table 8, the ++MLM 2019 variations of both roberta-base and bert-base-uncased
models exhibit notable improvements in the f-Short and f-Long scores, as well as reduced RPD values
compared to their standard counterparts. The ++MLM 2019 variation of roberta-base achieved an f-Avg
score of (0.590), an improvement over the original model’s score of (0.571). It also showed a significantly
lower RPD-Short of (-4.74%) and RPD-Long of (-11.46%), indicating better resilience to temporal changes
over both short and long gaps. Similarly, the ++MLM 2019 variation of bert-base-uncased achieved
an f-Avg score of (0.570), slightly lower than the original model’s 0.573. However, it exhibited a lower
RPD-Long of (-10.01%) and RPD-Avg of (-14.94%), demonstrating improved performance persistence
over time.</p>
      <p>These results reinforce the value of continual incremental pretraining with recent data to maintain
and improve model performance in dynamic environments. The ++MLM 2019 variations consistently
showed enhanced performance metrics and reduced performance degradation over time, validating the
effectiveness of this approach in enhancing temporal persistence.
3.8. Discussion and conclusion
This section discusses the results of our study on temporally adaptive classification methods, highlighting
the significance of incorporating temporal information into text classification models to mitigate
performance drops over time and the use of an outdated language model. These results reveal that
classifiers trained on older data exhibit significant performance drops when applied to newer data.
This is evident from the relative performance drops (RPD) reported, where the ++MLM 2019 variations
showed a marked improvement in mitigating this drop.</p>
      <p>Previous work by Alkhalifa et al. [26] introduced the Incremental Temporal Alignment (ITA) method as
a superior approach for enhancing the temporal persistence of static word embeddings. This method aligns
closely with the continual incremental pretraining approach evaluated in our results, where ++MLM
2019 variations of both roberta-base and bert-base-uncased demonstrated improved f-Within, f-Avg
scores, and lower RPD values. The ITA method’s emphasis on leveraging incremental updates to word
embeddings aligns with the improvements seen in the ++MLM 2019 models, showcasing their resilience
to evolving data and enhancing their persistence as text classifiers as context updates over time.</p>
      <p>The results reinforce several best practices for designing temporally robust and persistent text
classifiers. Methods relying on incremental updates generally outperform static embeddings, as corroborated
by the superior performance of the ++MLM 2019 models. Additionally, it is crucial to select robust
baseline models and incrementally update them to accommodate evolving language patterns over time.</p>
      <p>The practical implications of our findings are significant for real-world NLP applications. In dynamic
environments such as stance posts on social media, language evolves rapidly, and temporal
adaptation through an incremental pretraining approach substantially enhances the longevity and persistence
of text classifiers. These results provide empirical evidence supporting the implementation of temporally
adaptive classification methods in real-world scenarios.</p>
    </sec>
    <sec id="sec-4">
      <title>Acknowledgments</title>
      <p>This work is supported by the ANR Kodicare bi-lateral project, grant ANR-19-CE23-0029 of the French
Agence Nationale de la Recherche, and by the Austrian Science Fund (FWF, grant I4471-N). This work
is also supported by a UKRI/EPSRC Turing AI Fellowship to Maria Liakata (grant no. EP/V030302/1).
This work has been using services provided by the LINDAT/CLARIAH-CZ Research Infrastructure
(https://lindat.cz), supported by the Ministry of Education, Youth and Sports of the Czech Republic
(Project No. LM2023062) and has been also supported by the Ministry of Education, Youth and Sports
of the Czech Republic, Project No. LM2023062 LINDAT/CLARIAH-CZ.
Conference of the CLEF Association (CLEF 2024), Lecture Notes in Computer Science (LNCS),
Springer, Heidelberg, Germany, 2024.
[5] P. Galuščáková, R. Deveaud, G. Gonzalez-Saez, P. Mulhem, L. Goeuriot, F. Piroi, M. Popel,
Longevalretrieval: French-english dynamic test collection for continuous web search evaluation, 2023.
arXiv:2303.03229.
[6] K. Song, X. Tan, T. Qin, J. Lu, T.-Y. Liu, Mpnet: Masked and permuted pre-training for language
understanding, Advances in neural information processing systems 33 (2020) 16857–16867.
[7] H. Touvron, T. Lavril, G. Izacard, X. Martinet, M.-A. Lachaux, T. Lacroix, B. Rozière, N. Goyal,
E. Hambro, F. Azhar, et al., Llama: Open and efficient foundation language models, arXiv preprint
arXiv:2302.13971 (2023).
[8] A. Q. Jiang, A. Sablayrolles, A. Roux, A. Mensch, B. Savary, C. Bamford, D. S. Chaplot, D. d. l. Casas,</p>
      <p>E. B. Hanna, F. Bressand, et al., Mixtral of experts, arXiv preprint arXiv:2401.04088 (2024).
[9] A. Basaglia, A. Stocco, M. Popović, N. Ferro, Seupd@clef: Team dam on reranking using sentence
embedders, in: [29], 2024.
[10] L. Cazzador, F. L. D. Faveri, F. Franceschini, L. Pamio, S. Piron, N. Ferro, Seupd@clef: Team mouse
on enhancing search engines effectiveness with large language models, in: [29], 2024.
[11] F. Galli, M. Rigobello, M. Schibuola, R. Zuech, N. Ferro, Seupd@clef: Team iris on temporal
evolution of query expansion and rank fusion techniques applied to cross-encoder re-rankers, in:
[29], 2024.
[12] J. Keller, T. Breuer, P. Schaer, Leveraging prior relevance signals in web search, in: [29], 2024.
[13] S. Yoon, J. Kim, S. won Hwang, Analyzing the effectiveness of listwise reranking with positional
invariance on temporal generalizability, in: [29], 2024.
[14] A. Kimia, A. Akan, F. Arwa, N. Ferro, Seupd@clef: Team kalu on improving search engine
performance with query expansion and re-ranking approach, in: [29], 2024.
[15] D. Alexander, M. Fröbe, G. Hendriksen, F. Schlatt, M. Hagen, D. Hiemstra, M. Potthast, A. P.</p>
      <p>de Vries, Team openwebsearch at clef 2024: Longeval, in: [29], 2024.
[16] M. Gründel, M. Weber, J. Franke, J. H. Reimer, Team galápagos tortoise at longeval 2024: Neural
re-ranking and rank fusion for temporal stability, in: [29], 2024.
[17] F. Giachelle, O. Irrera, G. Silvello, Doctag: A customizable annotation tool for ground truth
creation, in: Advances in Information Retrieval: 44th European Conference on IR Research, ECIR
2022, Stavanger, Norway, April 10–14, 2022, Proceedings, Part II, volume 13186 of Lecture Notes in
Computer Science, Springer, 2022, pp. 288–293.
[18] D. Harman, TREC-Style Evaluations, Springer Berlin Heidelberg, Berlin, Heidelberg, 2013, pp. 97–
115. URL: https://doi.org/10.1007/978-3-642-36415-0_7. doi:10.1007/978-3-642-36415-0_7.
[19] D. Küçük, F. Can, Stance detection: A survey, ACM Comput. Surv. 53 (2020). URL: https://doi.org/
10.1145/3369026. doi:10.1145/3369026.
[20] S. M. Mohammad, P. Sobhani, S. Kiritchenko, Stance and sentiment in Tweets, ACM
Transactions on Internet Technology 17 (2017). URL: http://alt.qcri.org/semeval2016/task6/. doi:10.1145/
3003433. arXiv:1605.01655.
[21] R. Alkhalifa, A. Zubiaga, Capturing stance dynamics in social media: open challenges and research
directions, International Journal of Digital Humanities (2022) 1–21.
[22] R. Alkhalifa, I. Bilal, H. Borkakoty, J. Camacho-Collados, R. Deveaud, A. El-Ebshihy, L.
EspinosaAnke, G. Gonzalez-Saez, P. Galuščáková, L. Goeuriot, E. Kochkina, M. Liakata, D. Loureiro, H.
Tayyar Madabushi, P. Mulhem, F. Piroi, M. Popel, C. Servan, A. Zubiaga, Longeval: Longitudinal
evaluation of model performance at clef 2023, in: J. Kamps, L. Goeuriot, F. Crestani, M. Maistro,
H. Joho, B. Davis, C. Gurrin, U. Kruschwitz, A. Caputo (Eds.), Advances in Information Retrieval,
Springer Nature Switzerland, Cham, 2023.
[23] R. Alkhalifa, T. Yoong, E. Kochkina, A. Zubiaga, M. Liakata, QMUL-SDS at checkthat! 2020:
Determining COVID-19 tweet check-worthiness using an enhanced CT-BERT with numeric expressions,
CoRR abs/2008.13160 (2020). URL: https://arxiv.org/abs/2008.13160. arXiv:2008.13160.
[24] D. Efrosynidis, A. I. Karasakalidis, G. Sylaios, A. Arampatzis, The climate change twitter dataset,
Expert Systems with Applications 204 (2022) 117541. URL: https://www.sciencedirect.com/science/
article/pii/S0957417422008624. doi:https://doi.org/10.1016/j.eswa.2022.117541.
[25] R. Alkhalifa, I. M. Bilal, H. Borkakoty, Romain, Deveaud, A. El-Ebshihy, Luis, Espinosa-Anke,
Gabriela, Gonzalez-Saez, P. Galuscáková, L. Goeuriot, E. Kochkina, M. Liakata, D. Loureiro, P.
Mulhem, F. Piroi, M. Popel, C. Servan, H. T. Madabushi, Arkaitz, Zubiaga, Extended overview
of the clef-2023 longeval lab on longitudinal evaluation of model performance, 2023. URL:
https://api.semanticscholar.org/CorpusID:259953335.
[26] R. Alkhalifa, E. Kochkina, A. Zubiaga, Opinions are made to be changed: Temporally adaptive
stance classification, in: Proceedings of the 2021 Workshop on Open Challenges in Online Social
Networks, 2021, pp. 27–32.
[27] J. Devlin, M.-W. Chang, K. Lee, K. Toutanova, Bert: Pre-training of deep bidirectional transformers
for language understanding, in: Proceedings of the 2019 Conference of the North American Chapter
of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long
and Short Papers), 2019, pp. 4171–4186.
[28] Y. Liu, M. Ott, N. Goyal, J. Du, M. Joshi, D. Chen, O. Levy, M. Lewis, L. Zettlemoyer, V. Stoyanov,</p>
      <p>Roberta: A robustly optimized bert pretraining approach, arXiv preprint arXiv:1907.11692 (2019).
[29] G. Faggioli, N. Ferro, P. Galuščáková, A. G. S. de Herrera (Eds.), Proceedings of Working Notes of
CLEF 2024 - Conference and Labs of the Evaluation Forum, CEUR Workshop Proceedings, Aachen,
2024.
ABYSS_BM25-French-Stop50_40FR_10EN-SnowStem-Dict-Fuzzy-Phrase-Start-Synonyms-RR
ABYSS_BM25-French-Stop50_40FR_10EN-SnowStem-Fuzzy-Phrase-Start
ABYSS_BM25-French-Stop50_40FR_10EN-SnowStem-Fuzzy-Phrase-Start-RR
CIR_BM25
CIR_BM25+monoT5
CIR_BM25+qrel_boost
CIR_BM25+RF
CIR_BM25+time_boost
galapagos-tortoise-bm25-bo1-pl2-monot5-kmax-avg-k-4
galapagos-tortoise-bm25-bo1-pl2-monot5-max
galapagos-tortoise-bm25-bo1-pl2-monot5-mean
galapagos-tortoise-rank-zephyr
galapagos-tortoise-wsum
KALU_MISTRAL_FRENCH
KALU_RERANK_HARMONIC_MISTRAL_FRENCH
KALU_RERANK_HARMONIC_MISTRAL_FRENCH_SHOULD
KALU_RERANK_SIMPLE_FRENCH_LLAMA
KALU_RERANK_SIMPLE_MISTRAL_FRENCH
ows_bm25_bo1_keyqueries
ows_bm25_reverted_index
ows_ltr_all
ows_ltr_wows_all_rerank
ows_ltr_wows_base_rerank
ows_ltr_wows_rerank_and_keyquery
ows_ltr_wows_rerank_and_reverted_index
Quokkas_french-letter-lightstem
Quokkas_french-standard-lightstem
seupd2324-dam_EN-Stop-SnowBall-Poss-Prox(50)
seupd2324-dam_EN-Stop-SnowBall-Poss-Prox(50)-Reranking(200)
seupd2324-dam_FR-Stop-FrenchLight-Elision-ICU-Prox(50)
seupd2324-dam_FR-Stop-FrenchLight-Elision-ICU-Prox(50)-Reranking(150)
seupd2324-dam_FR-Stop-FrenchLight-Elision-ICU-Shingles-Prox(50)-Reranking(150)
seupd2324-iris_FR_GFF@12_w0.162_MMARCO@1000_ADD_w5
seupd2324-iris_FR_GFF@12_w0.162_MMARCO@1000_MAXMIN_ADD_w5
seupd2324-iris_FR_MMARCO@1000_ADD_w5
seupd2324-iris_FR_url_w1.4_GFF@12_w0.162_MMARCO@1000_ADD_w5
seupd2324-iris-FR_Q2K@1_w0.16_MMARCO@1000_MAXMIN_ADD_w5
seupd2324-lfzzo-englishSystem1
seupd2324-lfzzo-englishSystem2
seupd2324-lfzzo-englishSystem3
seupd2324-lfzzo-englishSystem4
seupd2324-lfzzo-englishSystem5
seupd2324-lfzzo-frenchSystem1
seupd2324-lfzzo-frenchSystem2
seupd2324-lfzzo-frenchSystem3
seupd2324-lfzzo-frenchSystem4
seupd2324-lfzzo-frenchSystem5
seupd2324-mouse_English_Porter_Standard_NoStop_Mixtral-8x7b_NoRerank
seupd2324-mouse_English_Porter_Standard_stopwords-en_LLama3-70b_NoRerank
seupd2324-mouse_English_Porter_Standard_top125_LLama3-70b_Cohere-100-w06
seupd2324-mouse_English_Porter_Standard_top125_LLama3-70b_Pygaggle-Luyu-20-w06
seupd2324-mouse_English_Porter_Standard_top125_Mixtral-8x7b_Pygaggle-Luyu-20-w06
seupd2324-mouse_French_FrenchLight_Standard_NoStop_Mixtral-8x7b_NoRerank
seupd2324-mouse_French_FrenchLight_Standard_stopwords-fr_LLama3-70b_NoRerank
seupd2324-mouse_French_FrenchLight_Standard_top125_LLama3-70b_Cohere-100-w06
seupd2324-mouse_French_FrenchLight_Standard_top125_LLama3-70b_Pygaggle-Luyu-20-w06
seupd2324-mouse_French_FrenchLight_Standard_top125_Mixtral-8x7b_Pygaggle-Luyu-20-w06
seupd2324-seekx_LetLightFR
seupd2324-seekx_LetLightStopFR
seupd2324-seekx_LetLightStopSynFR
seupd2324-seekx_StanMinEN
seupd2324-seekx_StanMinSynEN
SNU_LDI_listt5
SNU_LDI_monot5
WONDER_BASELINE
WONDER_ENGLISH
WONDER_ENGLISH_FRENCH
WONDER_FRENCH
WONDER_TWOPHASE
XPLORE_French-BM25-FrenchLight-Stop
XPLORE_French-BM25-FrenchLight-Stop-SynonymMapper
XPLORE_French-BM25Default-FrenchLight-Stop
XPLORE_French-LMDirichlet-FrenchLight-Stop</p>
    </sec>
  </body>
  <back>
    <ref-list>
      <ref id="ref1">
        <mixed-citation>
          [1]
          <string-name>
            <given-names>R. Gangi</given-names>
            <surname>Reddy</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Iyer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M. A.</given-names>
            <surname>Sultan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Zhang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Sil</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Castelli</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Florian</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Roukos</surname>
          </string-name>
          ,
          <article-title>Synthetic target domain supervision for open retrieval qa</article-title>
          ,
          <source>in: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval</source>
          , SIGIR '21,
          Association for Computing Machinery, New York, NY, USA,
          <year>2021</year>
          , p.
          <fpage>1793</fpage>
          -
          <lpage>1797</lpage>
          . URL: https://doi.org/10.1145/3404835.3463085. doi:10.1145/3404835.3463085.
        </mixed-citation>
      </ref>
      <ref id="ref2">
        <mixed-citation>
          [2]
          <string-name>
            <given-names>J.</given-names>
            <surname>Lovón-Melgarejo</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Soulier</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Pinel-Sauvagnat</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Tamine</surname>
          </string-name>
          ,
          <article-title>Studying catastrophic forgetting in neural ranking models</article-title>
          , Springer-Verlag, Berlin, Heidelberg,
          <year>2021</year>
          , p.
          <fpage>375</fpage>
          -
          <lpage>390</lpage>
          . URL: https://doi.org/10.1007/978-3-030-72113-8_25. doi:10.1007/978-3-030-72113-8_25.
        </mixed-citation>
      </ref>
      <ref id="ref3">
        <mixed-citation>
          [3]
          <string-name>
            <given-names>R.</given-names>
            <surname>Alkhalifa</surname>
          </string-name>
          ,
          <string-name>
            <given-names>I.</given-names>
            <surname>Bilal</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Borkakoty</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Camacho-Collados</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Deveaud</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>El-Ebshihy</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Espinosa-Anke</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Gonzalez-Saez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Goeuriot</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Kochkina</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Liakata</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Loureiro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H. T.</given-names>
            <surname>Madabushi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Mulhem</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Piroi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Popel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Servan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Zubiaga</surname>
          </string-name>
          ,
          <article-title>Overview of the clef-2023 longeval lab on longitudinal evaluation of model performance</article-title>
          , in:
          <source>Experimental IR Meets Multilinguality, Multimodality, and Interaction. Proceedings of the Fourteenth International Conference of the CLEF Association (CLEF 2023), Lecture Notes in Computer Science (LNCS)</source>
          , Springer, Thessaloniki, Greece,
          <year>2023</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref4">
        <mixed-citation>
          [4]
          <string-name>
            <given-names>R.</given-names>
            <surname>Alkhalifa</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Borkakoty</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Deveaud</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>El-Ebshihy</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Espinosa-Anke</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Fink</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Gonzalez-Saez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Goeuriot</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Iommi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Liakata</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H. T.</given-names>
            <surname>Madabushi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Medina-Alias</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Mulhem</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Piroi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Popel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Zubiaga</surname>
          </string-name>
          ,
          <article-title>Overview of the CLEF 2024 LongEval Lab on Longitudinal Evaluation of Model Performance</article-title>
          , in:
          <string-name>
            <given-names>L.</given-names>
            <surname>Goeuriot</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Mulhem</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Quénot</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Schwab</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Soulier</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G. M.</given-names>
            <surname>Di Nunzio</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A. G. S.</given-names>
            <surname>de Herrera</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Faggioli</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          (Eds.),
          <source>Experimental IR Meets Multilinguality, Multimodality, and Interaction</source>
          . Proceedings of the Fifteenth International
        </mixed-citation>
      </ref>
    </ref-list>
  </back>
</article>