<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-title-group>
<journal-title>Conference and Labs of the Evaluation Forum</journal-title>
      </journal-title-group>
    </journal-meta>
    <article-meta>
      <title-group>
        <article-title>Extended Overview of the CLEF-2023 LongEval Lab on Longitudinal Evaluation of Model Performance</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author">
          <string-name>Rabab Alkhalifa</string-name>
          <xref ref-type="aff" rid="aff3">3</xref>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Iman Bilal</string-name>
          <xref ref-type="aff" rid="aff10">10</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Hsuvas Borkakoty</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Jose Camacho-Collados</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Romain Deveaud</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Alaa El-Ebshihy</string-name>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Luis Espinosa-Anke</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Gabriela Gonzalez-Saez</string-name>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Petra Galuščáková</string-name>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Lorraine Goeuriot</string-name>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Elena Kochkina</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Maria Liakata</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff10">10</xref>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Daniel Loureiro</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Philippe Mulhem</string-name>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Florina Piroi</string-name>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Martin Popel</string-name>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Christophe Servan</string-name>
          <xref ref-type="aff" rid="aff4">4</xref>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Harish Tayyar Madabushi</string-name>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Arkaitz Zubiaga</string-name>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>iman.bilal@warwick.ac.uk (I. Bilal)</string-name>
        </contrib>
        <contrib contrib-type="author">
<string-name>borkakotyh@cardiff.ac.uk (H. Borkakoty)</string-name>
        </contrib>
        <contrib contrib-type="author">
          <string-name>r.deveaud@qwant.com (R. Deveaud)</string-name>
        </contrib>
        <contrib contrib-type="author">
          <string-name>alaa.el-ebshihy@tuwien.ac.at (A. El-Ebshihy)</string-name>
        </contrib>
        <contrib contrib-type="author">
<string-name>espinosa-ankel@cardiff.ac.uk (L. Espinosa-Anke)</string-name>
        </contrib>
        <contrib contrib-type="author">
          <string-name>e.kochkina@qmul.ac.uk (E. Kochkina)</string-name>
        </contrib>
        <contrib contrib-type="author">
          <string-name>m.liakata@qmul.ac.uk (M. Liakata)</string-name>
        </contrib>
        <contrib contrib-type="author">
<string-name>boucanovaloureirod@cardiff.ac.uk (D. Loureiro)</string-name>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Philippe.Mulhem@imag.fr (P. Mulhem)</string-name>
        </contrib>
        <contrib contrib-type="author">
<string-name>florina.piroi@researchstudio.at (F. Piroi)</string-name>
        </contrib>
        <contrib contrib-type="author">
<string-name>popel@ufal.mff.cuni.cz (M. Popel)</string-name>
        </contrib>
        <contrib contrib-type="author">
          <string-name>c.servan@qwant.com (C. Servan)</string-name>
        </contrib>
        <contrib contrib-type="author">
          <string-name>@bath.ac.uk (H. Tayyar Madabushi)</string-name>
        </contrib>
        <contrib contrib-type="author">
<string-name>a.zubiaga@qmul.ac.uk (A. Zubiaga)</string-name>
        </contrib>
        <aff id="aff0">
          <label>0</label>
          <institution>Alan Turing Institute</institution>
          ,
          <country country="UK">UK</country>
        </aff>
        <aff id="aff1">
          <label>1</label>
<institution>Cardiff University</institution>
          ,
          <country country="UK">UK</country>
        </aff>
        <aff id="aff2">
          <label>2</label>
          <institution>Charles University</institution>
          ,
          <addr-line>Prague</addr-line>
          ,
          <country country="CZ">Czech Republic</country>
        </aff>
        <aff id="aff3">
          <label>3</label>
          <institution>Imam Abdulrahman Bin Faisal University</institution>
          ,
<country country="SA">Saudi Arabia</country>
        </aff>
        <aff id="aff4">
          <label>4</label>
          <institution>Paris-Saclay University</institution>
          ,
          <addr-line>CNRS, LISN</addr-line>
          ,
          <country country="FR">France</country>
        </aff>
        <aff id="aff5">
          <label>5</label>
          <institution>Queen Mary University of London</institution>
          ,
          <country country="UK">UK</country>
        </aff>
        <aff id="aff6">
          <label>6</label>
          <institution>Qwant</institution>
          ,
          <country country="FR">France</country>
        </aff>
        <aff id="aff7">
          <label>7</label>
          <institution>Research Studios Austria, Data Science Studio</institution>
          ,
<addr-line>Vienna</addr-line>
          ,
          <country country="AT">Austria</country>
        </aff>
        <aff id="aff8">
          <label>8</label>
          <institution>Univ. Grenoble Alpes, CNRS, Grenoble INP (Institute of Engineering Univ. Grenoble Alpes.)</institution>
          ,
          <addr-line>LIG, Grenoble</addr-line>
          ,
          <country country="FR">France</country>
        </aff>
        <aff id="aff9">
          <label>9</label>
          <institution>University of Bath</institution>
          ,
          <country country="UK">UK</country>
        </aff>
        <aff id="aff10">
          <label>10</label>
          <institution>University of Warwick</institution>
          ,
          <country country="UK">UK</country>
        </aff>
      </contrib-group>
      <pub-date>
        <year>2023</year>
      </pub-date>
      <volume>1</volume>
      <fpage>8</fpage>
      <lpage>21</lpage>
      <abstract>
<p>We describe the first edition of the LongEval CLEF 2023 shared task. This lab evaluates the temporal persistence of Information Retrieval (IR) systems and Text Classifiers. Task 1 requires IR systems to run on corpora acquired at several timestamps, and evaluates the drop in system quality (NDCG) along these timestamps. Task 2 tackles binary sentiment classification at different points in time, and evaluates the performance drop for different temporal gaps. Overall, 37 teams registered for Task 1 and 25 for Task 2. Ultimately, 14 and 4 teams participated in Task 1 and Task 2, respectively.</p>
      </abstract>
      <kwd-group>
<kwd>Evaluation</kwd>
        <kwd>Temporal Persistence</kwd>
        <kwd>Temporal Generalisability</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec id="sec-1">
      <title>1. Introduction</title>
      <p>
Time is a dimension that is often overlooked when conducting Information Retrieval (IR)
experiments, especially when static data sets are utilized. Some data sets, like CORD-19, are
collected at different points in time, showing differences in the set of documents from one
collection time to another. Recent research [
        <xref ref-type="bibr" rid="ref1">1</xref>
] has demonstrated that models trained on data
pertaining to a particular time period struggle to maintain their performance levels when
applied to test data that is distant in time.
      </p>
<p>With the aim of making models maintain persistent quality over time, the objective of the
LongEval lab is twofold: (i) to explore the extent to which temporal differences over time, as
reflected in the evolution of evaluation datasets, result in the deterioration of the performance
of information retrieval and classification systems, and (ii) to propose improved methods that
mitigate the performance drop by making models more robust over time.</p>
      <p>
        The LongEval lab [
        <xref ref-type="bibr" rid="ref2">2</xref>
] took place as part of the Conference and Labs of the Evaluation Forum
(CLEF) 2023, and consisted of two separate tasks: (i) Task 1, focused on information retrieval,
and (ii) Task 2, focused on text classification for sentiment analysis. Both tasks provided labeled
datasets enabling analysis and evaluation of models over longitudinally evolving data.
      </p>
      <p>
        In this paper, we add details to [
        <xref ref-type="bibr" rid="ref2">2</xref>
], by focusing on the dataset statistics and by analysing in
detail the overall participant runs and results for each task.
      </p>
    </sec>
    <sec id="sec-2">
      <title>2. Task 1 - Retrieval</title>
<p>The goal of the retrieval task is to explore the effect of changes in datasets on the retrieval of
text documents. More specifically, we focus on a setup in which the datasets are evolving. This
means that one dataset can be acquired from another by adding, removing (and replacing) a
limited number of documents and queries. We explore two main scenarios, and the setup of the
task thus reflects the details of these two problems.</p>
      <sec id="sec-2-1">
        <title>A single system in an evolving setup</title>
<p>We explore how one selected system behaves if we evaluate it using several collections which
evolve over time. Specifically, we explore the effect of changes in datasets on retrieval
performance in a Web search domain. In this domain, the documents, the queries and also the
perception of relevance naturally and continuously evolve, and Web search engines need to deal
with this situation. The evaluation in this scenario is thus very specific and should take into
account the evolving nature of the data. Evaluation should ideally reflect the changes in the
collection and especially signal substantial changes that could lead to a performance drop. This
would allow re-training the search engine model only when it is really necessary, and
enable much more efficient overall training.</p>
<p>This problem also emerges with the popularity of neural networks. The performance of
neural networks seems to be less stable than that of statistical models. Moreover, the
performance strongly depends on the data used for training the neural model. One objective of
the task is to explore the behavior of neural systems in the evolving data scenario.</p>
      </sec>
      <sec id="sec-2-2">
        <title>Comparison of multiple systems in an evolving setup</title>
<p>While the first scenario explores a single system, comparing this system with multiple
systems across evolving collections should provide more information about system stability
and robustness.</p>
        <sec id="sec-2-2-1">
          <title>2.1. Description of the task</title>
<p>The task datasets were created over sequential time periods, which allows making observations
at different time stamps and, most importantly, comparing the performance across different
time stamps t and t′. Two sub-tasks are organized as follows:
A) Short-term (ST) Persistence task, which aims to assess the performance difference between t
and t′ when t′ occurs right after or shortly after t;
B) Long-term (LT) Persistence task, which aims to examine the performance difference between
t and t″, when t″ occurs several months after t (and thus |t″ − t| &gt; |t′ − t|).
In addition to this, we provide a Within-time (WT) dataset, which contains the same documents
(but different queries) as the training data. These data are used as a control group and applied to
measure change against the training data.</p>
        </sec>
        <sec id="sec-2-2-2">
          <title>2.2. Dataset</title>
          <p>
Data for this task were provided by the French search engine Qwant. They consist of queries
issued by the users of this search engine; cleaned Web documents, which were selected 1) to
correspond to the queries and 2) to add additional noise; and relevance judgments, which were
created using a click model. The dataset is fully described in Galuščáková et al. [
            <xref ref-type="bibr" rid="ref3">3</xref>
]. We provided
training data, which included 672 train queries, with the corresponding 9,656 assessments and
1,570,734 Web pages. In addition to this, the training data included the 98 heldout WT queries.
All training and heldout data were collected during June 2022. Test data were split into two
collections, each corresponding to a single sub-task. The data for the short-term persistence
sub-task were collected over July 2022; this dataset contains 1,593,376 documents and 882
queries. The data for the long-term persistence sub-task were collected over September 2022;
this dataset consists of 1,081,334 documents and 923 queries. All the datasets are freely available
at Lindat/Clarin. The data we collected are mostly in French; therefore, to help participants, the
LongEval data set for the Retrieval task also contains automatic translations into English of
both queries and documents.
          </p>
<p>The document and query overlap ratios between the collections are given in Table 1 and
Table 2. Queries for the Heldout collection were selected not to overlap with the Train queries,
and these two collections share all the documents. The overlap between the Heldout/Train
collections is surprisingly high, especially in terms of documents.</p>
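          <p>For reproducibility, such overlap ratios can be computed from the identifier sets of two collections. The following is a minimal sketch; the identifiers and the exact normalisation (here, the fraction of the first collection found in the second) are assumptions of this sketch, not the lab's released tooling.</p>
          <preformat>
# Minimal sketch: overlap ratio between two collections, computed over
# sets of document (or query) identifiers. Identifiers are hypothetical.

def overlap_ratio(ids_a, ids_b):
    """Fraction of collection A that also appears in collection B."""
    if not ids_a:
        return 0.0
    return len(ids_a.intersection(ids_b)) / len(ids_a)

train_docs = {"d1", "d2", "d3", "d4"}   # placeholder identifiers
st_docs = {"d2", "d3", "d4", "d5"}
print(overlap_ratio(train_docs, st_docs))  # 0.75
          </preformat>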
          <p>To evaluate the submissions we use two different sets of relevance judgments: a) the judgments
acquired by the click model, based on the raw clicks of the users; and b) manual relevance
judgments on a pooled query subset. As the manual evaluations are ongoing, in this paper
we only report the relevance judgments acquired from the click model. For evaluating both
sub-tasks, we use the NDCG measure (calculated for each dataset), as well as the drop between
the ST and LT collections against the training data (WT collection).</p>
        </sec>
        <sec id="sec-2-2-2b">
          <title>2.3. Submissions</title>
          <p>14 teams submitted their systems to the Retrieval task. 12 of these teams submitted results
for both the Short-term and Long-term retrieval sub-tasks; two teams only submitted results
for the Short-term retrieval sub-task. As per the requirements, all participating teams also needed to
submit their systems on the within-time dataset, which was created in the same timeframe as
the training data, allowing the relative drop between the datasets to be measured. All teams, except
one, which submitted 4 systems, decided to submit 5 systems. Together with the 4 baseline runs
provided by the Université Grenoble Alpes (marked as UGA), this creates a pool of 73 systems
available on the within-time (WT, corresponding to the Heldout queries run on the Train
corpus) and short-term (ST) collections, and 63 systems available on the long-term collection.</p>
        </sec>
        <sec id="sec-2-2-3">
          <title>2.4. Absolute Scores</title>
<p>Table 3 gives the overview of NDCG and MAP scores for each submitted run on the different
datasets (WT, ST, LT). For each run, the columns of the table indicate which language was used
(English, French, or both), whether any neural approach was involved (values yes/no), and
whether a single approach or a combination of several approaches was used (values yes/no). We show
NDCG score histograms for these runs, in decreasing order, for each dataset, showing whether
a run uses any neural approach (green for yes, yellow for no) in Figure 1, and whether the run
uses a combination of more than a single approach (orange for yes, cyan for no) in Figure 2, for
both the WT and ST collections. This information was acquired from the participants through a
questionnaire they had to fill in for each submitted run. Figure 3 shows which language
each run made use of.</p>
<p>From Table 3 we see that the systems which did best on the WT data are also among the
top for the ST and LT datasets. For instance, the best system on WT according to the NDCG
measure (FADERIC_Fr-BM25-S50-LS-S-F-SC-R20W6) is ranked best also on ST, and, considering
the systems that obtained a non-zero evaluation for the two tasks, the best system considering
NDCG on WT, SQUID_SEARCHERAI, is also the best on the ST and LT datasets. This finding does
not hold for the MAP measure: considering the systems that participated in the two tasks,
the best system for MAP on WT, CLOSE_SBERT_BM25, is the second best on the ST dataset
and the fourth best on the LT dataset. An explanation may come from the fact that NDCG
emphasizes the top-ranked documents of the runs.</p>
<p>We now describe the methods used in the top-3 runs on the WT dataset, according to the NDCG evaluation:
1. CLOSE_SBERT_BM25 from the CLOSE team: the system uses query variants generated
by GPT using dedicated prompts, and applies Sentence-BERT to rerank the initial BM25
results.
2. gwca_lightstem-phrase-qexp from the GWCA team: this system uses a French stoplist
and stemmer; a query expansion is composed of the original text, phrases extracted from
the query, and text generated using GPT 3.5.
3. SQUID_SEARCHERAI from the Squid team: this system relies on Lucene indexing and
search on the French documents and queries. It uses several fields for the documents
(title/url/body) with different boost values, and expands the queries with synonyms from
GPT 3.5.</p>
<p>For the ST dataset, the top-3 systems are:
1. FADERIC_Fr-BM25-S50-LS-S-F-SC-R20W6 from the FADERIC team. The matching is
based on BM25, fine-tuned on the training set. The query processing uses Lucene
fuzzy matching, which allows partial matches of words, and integrates synonym expansion.
A reranking step linearly fuses the BM25 and BERT scores for the top 20 BM25 documents
(a sketch of such a fusion is given below).
Though the runs from the FADERIC team achieve the highest NDCG scores on the ST
collection, the scores achieved on the LT collection are unfortunately zero, presumably due
to an error.
2. FADERIC_Fr-BM25-S50-LS-S-F-R30 from the FADERIC team. This run is similar to the
one above; the differences lie in the number of documents reranked (here 30) and a
different weight of the BM25 score in the linear combination.
3. SQUID_SEARCHERAI from the Squid team, already described above.</p>
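          <p>To make the fusion used by the FADERIC runs concrete, the sketch below shows a generic linear combination of BM25 and neural scores for the top-k first-stage documents. The weight w, the cut-off k and the bert_score function are hypothetical placeholders; the team's actual implementation may differ.</p>
          <preformat>
# Sketch of linear BM25/BERT score fusion for reranking the top-k
# documents of a first-stage BM25 ranking. All parameters are illustrative.

def rerank(query, bm25_ranking, bert_score, w=0.6, k=20):
    """bm25_ranking: list of (doc_id, bm25_score), best first."""
    head = bm25_ranking[:k]
    fused = [(doc, w * s + (1 - w) * bert_score(query, doc))
             for doc, s in head]
    fused.sort(key=lambda pair: pair[1], reverse=True)
    # documents beyond rank k keep their original BM25 order
    return fused + bm25_ranking[k:]
          </preformat>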
<p>For the LT dataset, the top-3 systems are:
1. CLOSE_SBERT_BM25 from the CLOSE team, already described;
2. SQUID_W2V from the Squid team: this system relies on Lucene indexing and search on
the French documents and queries. It uses several fields for the documents (title/url/body)
with different boost values, and expands the queries with word2vec-similar terms;
3. SQUID_SEARCHERAI from the Squid team, already described above.</p>
<p>Thus, the best approaches all rely to some extent on query expansion techniques, and integrate
at one point or another embeddings or Large Language Models. The best results use the French
documents and queries. The effect of the translation provided by the lab has a clear impact.
This remark is exemplified by the UGA baselines: UGA_BM25_French outperforms the
UGA_BM25_English default, and similarly the French T5 reranking run (UGA_T5_French)
outperforms its English counterpart (UGA_T5_English).</p>
<p>Considering Figures 2 and 3, we see that the shape of the distribution of the NDCG values
is similar for the WT and ST datasets. However, the best systems have higher performance
on WT than on ST: 13 runs on the WT dataset are above 0.4, while only 7 are on the ST dataset.</p>
        </sec>
        <sec id="sec-2-2-4">
          <title>2.5. Changes in the Scores</title>
<p>The main part of the task is to see the changes in the scores between the collections. All
collections were created using the same approach and procedure and have a high overlap in
terms of both queries and documents. In Table 4, we thus provide the relative drops between
the collections ST and WT and between the collections LT and WT. The “WT-ST” NDCG
change is defined, for a run s, as:</p>
          <disp-formula>
            <tex-math><![CDATA[ \frac{\mathrm{NDCG}_{WT}(s) - \mathrm{NDCG}_{ST}(s)}{\mathrm{NDCG}_{WT}(s)} ]]></tex-math>
          </disp-formula>
          <p>For “WT-LT” the formula is:</p>
          <disp-formula>
            <tex-math><![CDATA[ \frac{\mathrm{NDCG}_{WT}(s) - \mathrm{NDCG}_{LT}(s)}{\mathrm{NDCG}_{WT}(s)} ]]></tex-math>
          </disp-formula>
<p>With such definitions, large negative values in the columns “WT-ST” and “WT-LT” mean that
the systems are able to generalize well on the new test collections, as the WT heldout queries
are processed on the same document corpus as the training data, which is not the case for the ST
and LT datasets.</p>
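          <p>A minimal sketch of this relative change, assuming the per-run NDCG scores on the WT, ST and LT collections have already been computed:</p>
          <preformat>
# Relative NDCG change of a run against the WT collection, following the
# definition above: positive values indicate a drop relative to WT,
# negative values indicate the run improves on the newer collection.

def relative_change(ndcg_wt, ndcg_other):
    return (ndcg_wt - ndcg_other) / ndcg_wt

# hypothetical scores for one run
print(relative_change(0.42, 0.39))  # "WT-ST" change
print(relative_change(0.42, 0.35))  # "WT-LT" change
          </preformat>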
<p>What we see in Table 4 is that the systems that are the most robust to the evolution of the
test collection are not the top ones: for instance, the NEON_3b run is almost at the bottom of
Table 5 but does increase its NDCG values at ST, as well as at LT. We also see that the best
systems according to NDCG at ST, FADERIC_Fr-BM25-S50-LS-S-F-SC-R20W6,
FADERIC_Fr-BM25-S50-LS-S-F-R30 and SQUID_SEARCHERAI, are stable or decrease their NDCG values
at ST.</p>
<p>On average (last line of Table 4), the systems improve their results less on ST than on LT,
which is surprising. This point will need further exploration, as it looks contradictory
to what we were expecting, since there are more differences between the WT and LT datasets than
between the WT and ST datasets (see Tables 1 and 2). Another element worth noticing is that the NDCG
changes WT-ST and WT-LT behave consistently: for most of the systems the absolute value of
WT-ST is smaller than the absolute value of WT-LT.</p>
        </sec>
        <sec id="sec-2-2-5">
          <title>2.6. Run Rankings</title>
<p>We have so far studied our first problem, which was the comparison of performance of a single
system in an evolving setup. Next, we would like to study how the submitted runs compare to
each other, either in terms of the absolute NDCG scores achieved on the collections, or in terms
of the NDCG changes between the collections. For this, we display the rankings of runs according
to all these criteria in Table 5.</p>
<p>In addition, we also calculated the Pearson correlation between the rankings. The correlation
between the rankings (in terms of NDCG scores) achieved on WT and ST is very high (0.95).
The correlation between the WT and LT rankings and between the ST and LT rankings is slightly lower –
0.71 and 0.70, respectively. This corresponds with the high overlaps of the documents and also
the queries between the WT and ST collections and the slightly smaller overlaps of the LT collection.</p>
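          <p>For illustration, such a correlation can be obtained by aligning the rank vectors of the runs and applying Pearson's correlation; the rank values below are hypothetical.</p>
          <preformat>
# Pearson correlation between two rankings of the same runs,
# with rank vectors aligned by run identifier (values are illustrative).
from scipy.stats import pearsonr

ranks_wt = [1, 2, 3, 4, 5]  # ranks of five runs on WT
ranks_st = [1, 3, 2, 4, 5]  # ranks of the same runs on ST
r, _ = pearsonr(ranks_wt, ranks_st)
print(round(r, 2))  # 0.9
          </preformat>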
<p>The correlation between the ranking according to the NDCG score achieved on the WT
dataset and the ranking of the performance change is negative. The Pearson correlation is -0.65
for the ST dataset and -0.51 for the LT dataset. This means that the better a system initially
performs, the harder it is to improve it. Not surprisingly, there is thus also a negative correlation
between the ranking achieved on the ST dataset and the ranking of the change between the
ST and WT datasets (-0.42). However, there is no such correlation (0.05) between the ranking
achieved on the LT dataset and the ranking of the change between the WT and LT datasets.</p>
          <p>
            We also provided the normalized results to the participants. The normalization was done
according to Urbano et al. [
            <xref ref-type="bibr" rid="ref4">4</xref>
] and the mean and standard deviation of the scores of all submitted
runs were calculated. These were then used to standardize each score, which was subsequently
mapped using the normal CDF into the 0-1 space. However, the original ranking and the ranking
according to the normalized values are highly correlated: 0.93,
0.95, and 0.88 for the WT, ST and LT datasets, respectively. We thus do not work further with the
normalized results.
          </p>
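          <p>A minimal sketch of this normalization, assuming the scores of all submitted runs are standardized against their mean and standard deviation and then mapped into the 0-1 space with the normal CDF:</p>
          <preformat>
# Standardize run scores and map them into [0, 1] via the normal CDF,
# in the spirit of Urbano et al. [4]. Scores are hypothetical.
import numpy as np
from scipy.stats import norm

scores = np.array([0.31, 0.42, 0.38, 0.25, 0.45])
z = (scores - scores.mean()) / scores.std(ddof=1)
normalised = norm.cdf(z)  # shifted into the 0-1 space
print(normalised.round(3))
          </preformat>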
<p>Last, we calculated a combination of both rankings (the ranking in terms of absolute values and
the ranking in terms of change). For this, we first calculated a Borda count of the ranking in terms
of absolute values and a Borda count of the ranking in terms of relative change, and then we
simply summed these two Borda counts; these results are displayed in the last two columns of
Table 5. As the correlation between the absolute performance and the performance change is
negative, the best performing runs in terms of this measure are often mediocre in one measure
and well performing in the other – for instance, the seupd2223-hiball_BERT run achieves a high
performance change, while it is mediocre in terms of the NDCG achieved on the ST dataset.</p>
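          <p>A small sketch of this combination, assuming a simple Borda scheme in which the best-ranked run receives the most points and the two point totals are summed:</p>
          <preformat>
# Borda-count combination of two rankings (absolute NDCG and relative
# change). Run identifiers and rankings are illustrative.

def borda(ranking):
    """ranking: list of run ids, best first -> {run: points}."""
    n = len(ranking)
    return {run: n - i for i, run in enumerate(ranking)}

by_ndcg = ["runA", "runB", "runC"]
by_change = ["runC", "runA", "runB"]
b1, b2 = borda(by_ndcg), borda(by_change)
total = {run: b1[run] + b2[run] for run in b1}
print(sorted(total.items(), key=lambda kv: kv[1], reverse=True))
          </preformat>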
        </sec>
        <sec id="sec-2-2-6">
          <title>2.7. Queries Overview</title>
<p>We further investigate performance on the provided queries. Due to space reasons, we only
investigate the queries in the WT dataset, but these queries should also be representative of
the full collection, which is also confirmed by the overlap with the other query sets (see Table 2).</p>
<p>An overview of the scores achieved for the queries in the WT collection is displayed in Figure 4.
The figure shows the minimum performance (by any submitted run), the 25% quantile, the 75% quantile
and the maximum achieved NDCG score. Due to a relatively large number of runs, the range of
the scores achieved is typically quite large, and for some of the queries it even ranges between 0
and 1. The high diversity of the achieved scores might be further pronounced by the fact that around half
of the runs use the original French version, while the other half uses the English translations
(some of them use both).</p>
<p>Some of the worst performing queries are very general (the police, taxes, test car, Office)
and can thus be expected to be ambiguous. The two worst performing queries (Purple Potato and
gateau mascarpone) do not have any relevant documents in the qrels. As the number of relevant
documents is relatively similar for all the queries in the heldout collection (between 2 and 8),
the qrels have a limited effect on the hardness of a query. There are 2 queries with more than
10 relevant documents in the collection (potato salad, and emeraude space) and, though they are
in the top 30 of the easiest queries, neither of them is in the top 15 easiest queries.</p>
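          <p>The per-query statistics behind Figure 4 can be sketched as follows, assuming a matrix of NDCG scores with one row per run and one column per query (the random values stand in for real scores):</p>
          <preformat>
# Per-query minimum, 25%/75% quantiles and maximum over all runs,
# as plotted in Figure 4. The score matrix here is synthetic.
import numpy as np

ndcg = np.random.rand(73, 98)  # 73 runs x 98 heldout queries (placeholder)
stats = {
    "min": ndcg.min(axis=0),
    "q25": np.quantile(ndcg, 0.25, axis=0),
    "q75": np.quantile(ndcg, 0.75, axis=0),
    "max": ndcg.max(axis=0),
}
print(stats["q25"][:5].round(3))
          </preformat>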
        </sec>
        <sec id="sec-2-2-7">
          <title>2.8. Manual relevance judgments acquisition</title>
          <p>
The official evaluation results of the LongEval IR task rely on automatic assessments generated
from click models [
            <xref ref-type="bibr" rid="ref3">3</xref>
            ]. However, in a second step, it was decided to acquire classical manual
relevance judgments.
          </p>
          <p>
            To do that, we used the open source Doctag annotation tool [
            <xref ref-type="bibr" rid="ref5">5</xref>
] on a sample of 150 queries:
we randomly selected 50 queries from each of the test sets (heldout, short-term and long-term
queries). Doctag provides a customizable and portable platform specifically designed for
Information Retrieval (IR) evaluation. To perform manual relevance judgments using Doctag,
annotators utilize its web-based interface. They access the tool and interact with its annotation
functionalities, including the assignment of labels to indicate document relevance to specific
queries. Annotators view the documents and associate the appropriate relevance labels (Fig. 5).
          </p>
          <p>
The documents annotated come from a pooling of the participants' runs [
            <xref ref-type="bibr" rid="ref6">6</xref>
]. For the annotation
to remain tractable, we conducted a stratified sampling and selected 150 queries for evaluation:
all documents retrieved among the top 5 by any of the 63 systems, 50% of those at ranks 5-10 and
25% of those at ranks 10-30 are assessed by the annotators. 19,678 documents from the original
dataset were then assessed. The average number of assessments per query is around 130. To
perform the manual annotation and assess document relevance for the corresponding queries,
we assigned subsets of the document dataset to a team of 37 annotators. To ensure an efficient
workflow, we set up 10 dedicated online servers where Doctag was deployed. Each annotator
was assigned to a specific server to perform the annotation tasks. This distributed setup allowed
for parallel processing, enabling annotators to work simultaneously and collaborate effectively
within their assigned subsets.
          </p>
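          <p>The stratified pooling can be sketched as below, where for every run all documents at ranks 1-5 are pooled, half of those at ranks 5-10 and a quarter of those at ranks 10-30; the sampling mechanics are an assumption of this sketch.</p>
          <preformat>
# Stratified pooling of run rankings: all of ranks 1-5, 50% of ranks
# 5-10, 25% of ranks 10-30. Sampling details are illustrative.
import random

def pool(run_rankings, seed=0):
    """run_rankings: {run_id: ordered list of doc_ids, best first}."""
    rng = random.Random(seed)
    pooled = set()
    for docs in run_rankings.values():
        pooled.update(docs[:5])
        pooled.update(d for d in docs[5:10] if rng.random() > 0.5)
        pooled.update(d for d in docs[10:30] if rng.random() > 0.75)
    return pooled
          </preformat>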
          <p>
In the course of the ongoing annotation process applied to the dataset under examination,
we have currently recorded an aggregate of 14,953 judgments. These judgments span
four distinct categories: 'Relevant', 'Not Relevant', 'Partially Relevant', and 'I Don't Know'.
A preliminary analysis of the data indicates a propensity among annotators to categorize the
query-document pairs predominantly in the 'Not Relevant' category. Figure 6 presents the
judgment distribution for the top 30 queries in terms of document count. What we see in
Figure 6 is that the number of relevant documents is very large for some queries (with a peak
of over 100 relevant documents), even larger than the number of non-relevant documents. This number
of relevant documents is much larger than the threshold considered for the selection of queries
from the click model [
            <xref ref-type="bibr" rid="ref3">3</xref>
]. The impact of such differences on the evaluations will be studied before
the LongEval Workshop at CLEF.
          </p>
<p>Further evaluation rounds utilizing the collected data are currently in progress, and the full
implications of these results will become more apparent upon the completion of the evaluation
process. We will utilize the annotated documents and relevance annotations from the queries
to construct one aggregated Qrel file. With this Qrel file, we will run the evaluation using
trec_eval on the participants' runs.</p>
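          <p>For illustration, aggregating the judgments into the standard TREC qrels format expected by trec_eval could look as follows; the mapping of the four judgment categories to graded labels is an assumption, not the lab's final choice.</p>
          <preformat>
# Write judgments as TREC qrels lines: query_id iteration doc_id label.
# The category-to-label mapping below is hypothetical.
LABELS = {"Not Relevant": 0, "I Don't Know": 0,
          "Partially Relevant": 1, "Relevant": 2}

def write_qrels(judgments, path="longeval.qrels"):
    """judgments: iterable of (query_id, doc_id, category) triples."""
    with open(path, "w") as out:
        for qid, docid, category in judgments:
            out.write(f"{qid} 0 {docid} {LABELS[category]}\n")

# afterwards, e.g.: trec_eval longeval.qrels run.txt
          </preformat>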
        </sec>
        <sec id="sec-2-2-8">
          <title>2.9. Discussion and conclusion</title>
<p>This task was a first attempt at collectively investigating the impact of the evolution of data
on search systems' performance. Having 14 participating teams submitting runs confirmed
that this topic is of interest to the community.</p>
<p>The dataset released for this task consisted of a sequence of test collections corresponding to
different times. The collections were composed of documents and queries coming from Qwant,
and relevance judgments coming from a click model and manual assessment. While the manual
assessment is ongoing at the time of the paper's publication, the performance of participants'
submitted runs was measured using the click logs.</p>
<p>The results show that the best approaches were based on query expansion techniques, and on
embeddings or Large Language Models. The effect of the translation of the documents and
queries provided by the lab has a clear impact: the best results were obtained on the original
French data.</p>
<p>Since the subsets had substantial overlaps, the correlations between the system rankings were
quite high. As for the robustness of the systems towards dataset changes, we observed that the
systems that are the most robust to the evolution of the test collection were not the best performing
ones.</p>
<p>Further evaluations will be carried out in the near future with the manual assessment of the
pooled sets. A thorough analysis of the results will be necessary to study the impact of the queries
on the results (their nature, topic, difficulty, etc.). Further analysis will be necessary to
fully establish the robustness of the systems and the specific impact of dataset evolution on
performance.</p>
        </sec>
      </sec>
    </sec>
    <sec id="sec-3">
      <title>3. Task 2 - Classification</title>
      <p>
        As the meanings of words and phrases evolve over time, sentiment classifiers may struggle to
accurately capture the changing linguistic landscape [
        <xref ref-type="bibr" rid="ref7">7</xref>
], resulting in decreased effectiveness in
capturing the sentiments expressed in text. Recent research shows that this is particularly the case
when dealing with social media data [
        <xref ref-type="bibr" rid="ref8">8</xref>
]. Understanding the extent of this performance
drop and its implications is crucial for maintaining accurate and reliable sentiment analysis
models in the face of linguistic drift. This task aimed to quantitatively measure
the performance degradation of sentiment classifiers over time, providing insights into the
impact of language evolution on sentiment analysis tasks and identifying strategies to mitigate
the effects of temporal dynamics. Participants of this task were invited to submit the classification
outputs of systems that attempted to mitigate the temporal performance drop.
      </p>
<p>The aim of Task 2 was ultimately to answer the following research questions:
• RQ1: What types of models offer better short-term temporal persistence?
• RQ2: What types of models offer better long-term temporal persistence?
• RQ3: What types of models offer better overall temporal persistence?</p>
      <p>To assess the extent of the performance drop of models in shorter and longer temporal gaps,
we provided training data pertaining to a specific year (2016), as well as test datasets pertaining
to a close (2018) and a more distant (2021) year. In addition to measuring performance in each of
these years separately, this setup enabled evaluating relative performance drops by comparing
performance across years.</p>
      <sec id="sec-3-1">
        <title>3.1. Description of the task</title>
        <p>
          In this section, we introduce the task of temporal persistence classification, as the focus of
a recent shared task [
          <xref ref-type="bibr" rid="ref9">9</xref>
]. The goal of this task was to develop classifiers that can effectively
mitigate performance drops over short and long periods of time compared to a test set from the
same time frame as the training data.
        </p>
        <p>The shared task was in turn divided into two sub-tasks:</p>
        <p>Sub-Task 1: Short-Term Persistence: In this sub-task, participants were asked to develop
models that demonstrated performance persistence over short periods of time. Specifically, the
performance of the models was expected to be maintained within a temporal gap of two years
between the training and test data.</p>
        <p>Sub-Task 2: Long-Term Persistence: This sub-task focused on developing models that
demonstrated performance persistence over a longer period of time. The classifiers were
expected to mitigate performance drops over a temporal gap of five years between the training
and test data.</p>
<p>By providing a comprehensive training dataset, two practice sets, and three testing sets,
the shared competition aimed to stimulate the development of classifiers that can effectively
handle temporal variations and maintain performance persistence over different time distances.
Participants were expected to submit solutions for both sub-tasks, showcasing their ability to
address the challenges of temporal variations in performance.</p>
      </sec>
      <sec id="sec-3-2">
        <title>3.2. Dataset</title>
        <p>
          In this section, we present the process of constructing our final annotated corpus for the task.
The large-scale dataset TM-Senti was originally described in Yin et al. [
          <xref ref-type="bibr" rid="ref10">10</xref>
          ], from which we
extract samples that we use in this shared task. TM-Senti was chosen for the task as it provides
a sufficiently longitudinal dataset (covering multiple years) and uses a consistent data
collection and annotation strategy, which means that only the temporal evolution of the data
changes, with other potentially confounding factors removed.
        </p>
<p>Temporal granularity. In the shared task, the training set covered a time period with a
gap of 2 years, from 2014 to 2016. For the practice sets, within-time and distant-time sets were
introduced. The Practice-2016 set had a time gap of 0 years from the training data, given that it
overlapped with the training period. In addition, the Practice-2018 set was provided as a
distant test set to practice with, having a temporal gap of two years from the training data.</p>
        <p>For the test sets, the within set had a time gap of 0 years, covering the same period as the
within Practice-2016 set. The Test-short set had a time gap of 2 years, coinciding with the
distant Practice-2018 set. Lastly, the Test-long set had a time gap of 5 years, representing a
long-term evaluation scenario.</p>
        <p>By using these diferent time gaps, the shared task aimed to assess the models’ performance
persistence over varying temporal distances from the training data.</p>
<p>Un-labelled data. The data was sampled from Twitter using the Twitter academic API.
Then, duplicates and near-duplicates were removed. We also enforced a diversity of users and
removed tweets from the most frequent users with bot-like behaviour. Finally, user mentions were
replaced by '@user' for anonymization, except for verified users, which remained unchanged. For
all these preprocessing steps, we relied on the same pipeline and script used by Loureiro et al.
[11].</p>
<p>Test set annotation. The test set was annotated using Amazon Mechanical Turk (AMT).
AMT candidate workers were filtered based on successfully passing two qualification
tasks. The first, built into the system, seeks to find workers with a certain amount of experience who are
located in English-speaking countries, to ensure, to a certain extent, a high command of the English
language and high familiarity with AMT. The second qualification task consisted of presenting
each candidate annotator with 5 tweets, and only workers that correctly annotated 3 or more
were allowed to proceed to the actual annotation task.</p>
<p>In total, we annotated 4,032 tweets, divided into 1,874 positive, 741 neutral and 1,417
negative. Each tweet was annotated by 5 different workers, and the tweet's final label was
decided by computing the mode of the array of annotations. Table 6 shows instances of the
dataset, with labels and the number of agreements (between 5 and 3); examples include
"Honestly just a Hi from you already makes my day" (pos), "Been one of them weeks and I just
want to burst out crying" (neg), and "What s your fave throwback song to jam out to on
Thursdays I have too many tbt" (neu). In terms of overall statistics,
8.5% of the tweets were annotated with full agreement, 22.8% with 4 annotators agreeing, 46%
with 3 agreements, and the remaining 22.5% with 2 agreements, which were mostly decided
between positive and neutral, and between negative and neutral.</p>
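        <p>A minimal sketch of this label aggregation, taking the mode of the five crowd annotations (tie-breaking is left out here and was handled separately in practice):</p>
        <preformat>
# Final label of a tweet as the mode of its five annotations.
from collections import Counter

def final_label(annotations):
    (label, count), = Counter(annotations).most_common(1)
    return label, count  # count = number of agreements

print(final_label(["pos", "pos", "neu", "pos", "neg"]))  # ('pos', 3)
        </preformat>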
<p>Data preprocessing. We preprocess our dataset to ensure its quality with respect to the
following criteria:</p>
<p>• Diversity: All retweets and replies are eliminated.
• Consistency: We prioritise posts written in English and impose a length restriction such
that all posts contain at least 5 words and are at most 140 characters long.
• Fluency: Posts containing URL links are eliminated. In addition, we select posts which
contain at least one stop word as a proxy for fluency.</p>
        <p>Before sampling, all emojis and emoticons are deleted from the body of text.</p>
        <p>
          Data sampling. In the second stage, we sample from the preprocessed data previously
obtained. As we aim for a well-balanced annotated set, the sampling strategy is defined in
terms of: 1) sentiment distribution, 2) time span and 3) post length. For 1), we use the distant
labels provided by Yin et al. [
          <xref ref-type="bibr" rid="ref10">10</xref>
] to obtain a balanced distribution between the negative and
positive classes. For 2), we sample an equal number of posts for each month within the specified
temporal window in each dataset. Finally, for 3), we partition the data into four bins with respect
to the word length of each post (i.e., each post falls into one of the following bins: [5,10), [10,15),
[15,20) and [20,20+]) and uniformly sample from each bin.
        </p>
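        <p>The length-based binning of criterion 3) can be sketched as follows; the uniform per-bin sample size is a placeholder:</p>
        <preformat>
# Assign posts to word-length bins [5,10), [10,15), [15,20), [20,20+]
# and sample uniformly from each bin. Details are illustrative.
import random

def length_bin(post):
    n = len(post.split())
    if n >= 20:
        return "[20,20+]"
    if n >= 15:
        return "[15,20)"
    if n >= 10:
        return "[10,15)"
    return "[5,10)"

def sample_per_bin(posts, per_bin, seed=0):
    rng = random.Random(seed)
    bins = {}
    for p in posts:
        bins.setdefault(length_bin(p), []).append(p)
    return {b: rng.sample(v, min(per_bin, len(v))) for b, v in bins.items()}
        </preformat>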
        <p>The resulting distribution of data is shown in Table 7.</p>
      </sec>
      <sec id="sec-3-2b">
        <title>3.3. Evaluation</title>
        <p>The performance of the submissions was evaluated in two ways:</p>
        <p>1. Macro-averaged F1-score: This metric measured the overall F1-score on the testing set
for the sentiment classification sub-task. The F1-score combines precision and recall to
provide a balanced measure of model performance; per class,</p>
        <disp-formula>
          <tex-math><![CDATA[ F_1 = \frac{2 \cdot P \cdot R}{P + R} \qquad (1) ]]></tex-math>
        </disp-formula>
        <p>where P denotes precision and R recall. A higher F1-score indicated better
performance in terms of both positive and negative sentiment classification.</p>
        <p>2. Relative Performance Drop (RPD): This metric quantified the difference in
performance between the "within-period" data and the short- or long-term distant testing sets.
RPD was computed as the difference in performance scores between two sets. A negative
RPD value indicated a drop in performance compared to the "within-period" data, while a
positive value suggested an improvement:</p>
        <disp-formula>
          <tex-math><![CDATA[ \mathrm{RPD} = \frac{f_t - f_0}{f_0} \qquad (2) ]]></tex-math>
        </disp-formula>
        <p>where f<sub>0</sub> represents the performance when the time gap is 0 and f<sub>t</sub> represents the
performance when the time gap is short or long, as introduced in previous work [12].</p>
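        <p>A minimal sketch of the two measures, using scikit-learn for the macro-averaged F1-score; the scores passed to the RPD function are hypothetical:</p>
        <preformat>
# Macro-averaged F1 (equation 1) and Relative Performance Drop (equation 2).
from sklearn.metrics import f1_score

def macro_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average="macro")

def rpd(f_t, f_0):
    """Negative values indicate a drop relative to the within-period f_0."""
    return (f_t - f_0) / f_0

print(rpd(0.68, 0.73))  # about -0.068 for hypothetical scores
        </preformat>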
        <p>The submissions were ranked primarily based on the macro-averaged F1-score. This ranking
approach emphasized the overall performance of the sentiment classification models on the
testing set. The higher the macro-averaged F1-score, the higher the ranking of the submission.</p>
      </sec>
      <sec id="sec-3-3">
        <title>3.4. Results</title>
<p>Our shared task consisted of two sub-tasks: Short-term persistence (Sub-task A) and
Long-term persistence (Sub-task B). Sub-task A focused on developing models that demonstrated
performance persistence within a two-year gap from the training data, while Sub-task B required
models that exhibited performance persistence over a longer period, surpassing the two-year
gap. Additionally, an unlabeled corpus covering all periods of training, development, and
testing was provided to teams interested in data-centric approaches. Along with the data,
participating teams received Python-based baseline code and evaluation scripts. The shared
task progressed through two phases, and the results are discussed in the following paragraphs.</p>
      </sec>
      <sec id="sec-3-4">
        <title>3.5. Practice phase</title>
<p>The initial phase was the practice phase, where participants received three distantly annotated
sets: a training set, a within-time practice set and a short-term practice set. The training set was
used for model training, while the two labeled practice sets allowed participants to refine their
systems before the subsequent phase. Moreover, we limited the shared practice sets to the
within-time set (Practice-2016) and a single distant practice set, the short-term set (Practice-2018). This
decision was made because participants were requested to take part in both sub-tasks, and to
reduce over-fitting. The results of this phase were not considered in the final model ranking.</p>
<p>As can be seen from Table 8, Pablojmed showcased outstanding performance, surpassing
the Baseline model with the highest scores in F1 Score Within (0.8244) and F1 Score Short
(0.7976), as well as the highest Overall Score (0.811). saroyehun also demonstrated remarkable
performance, achieving the lowest Overall Drop (-0.0310), as well as outperforming the Baseline
model in F1 Score Within (0.8170) and F1 Score Short (0.7917). The results highlight the potential
of both Pablojmed's and saroyehun's submissions for enhancing the baseline model's results.</p>
      </sec>
      <sec id="sec-3-5">
        <title>3.6. Evaluation phase</title>
<p>During the evaluation phase, participants were provided with three human-annotated testing
sets, namely Test-within, Test-short and Test-long (see Section 3.2 for dataset details). The performance
of participants in this phase was used to determine the overall rankings for the task.</p>
<p>Short-term temporal persistence: From Table 9, we can see that the Baseline model
is still the best, achieving the highest short-term F1 Score (0.6839) among all the teams, indicating
that the RoBERTa architecture has a better performance in capturing short-term patterns compared
to the other models. At the same time, Cordyceps obtained the lowest short-term RPD value
(-0.0656), suggesting a smaller drop in performance compared to the Baseline model. This
indicates that Cordyceps may offer better short-term temporal persistence despite not having
the highest short-term F1 Score.</p>
<p>Long-term temporal persistence: In terms of long-term persistence, Pablojmed achieved
the highest F1 score (0.6971), indicating better performance in capturing long-term patterns
compared to the other models. However, when considering the long-term RPD measure,
pakapro obtained the lowest value (-0.0243), suggesting a smaller drop in performance compared
to the other models. Despite its lower F1 Score Long (0.4910), pakapro achieved the smallest
long-term RPD, suggesting that it maintains its performance more consistently over a longer
period and thus, considering RPD, offers better long-term temporal persistence than Pablojmed,
even without the highest long-term F1 Score.</p>
<p>Overall temporal persistence: Considering the overall scores, Pablojmed achieved the
highest overall score (0.7029) with a (-0.0708) overall RPD, indicating better overall temporal
persistence compared to the other models. However, pakapro offers better overall temporal
persistence based on the Overall Drop metric, indicating that pakapro's approach may be more
persistent over time in our case despite its low F1 Scores. Overall, the best model is Pablojmed,
demonstrating a better overall F1 score and higher temporal persistence than the Baseline model.
Additionally, the Baseline model performed best in short-term temporal persistence, and
pakapro shows promise for long-term temporal persistence despite not having the highest
long-term F1 Score.</p>
<p>Systems temporal ranking: The Baseline model ranks first in within-time and
short-term F1 Score but drops to fourth place in long-term F1 Score. Pablojmed and Cordyceps
interchange the second and third positions in both the within-time F1 Score and short-term
F1 Score categories, suggesting a relatively consistent ranking between these two models
within these specific categories. saroyehun consistently ranks fourth in both within-time F1
Score and short-term F1 Score. pakapro shows the worst performance among all, ranking fifth in
all three F1 scores, which nevertheless demonstrates consistent performance across different
timeframes compared to the other models.</p>
<p>It is important to note that ranking consistency varies across the different measures. We
can see that a low RPD does not indicate better performance, but rather a metric that is stable over
different sets. For example, if we look at the RPD metric, we see that pakapro achieves the best ranking
in the long-term and Overall Drop. This indicates a lower drop in performance over longer
timeframes. However, when considering the F1 Score, pakapro ranks fifth in all three categories:
F1 Score Within, F1 Score Short, and F1 Score Long. This demonstrates that a low RPD does not
necessarily indicate better performance in terms of F1 Score.</p>
<p>In all cases, the submitted systems demonstrated their highest performance when evaluated
on the within-time held-out set. Moreover, the overall performance of the participating teams
seems to have dropped between the practice phase and the final evaluation phase. Given that
participants are likely to have submitted their best models from the practice phase, it might be
the case that this drop is a result of over-optimistic performance estimates on the practice set.</p>
      </sec>
      <sec id="sec-3-6">
        <title>3.7. Discussion</title>
<p>Only two out of the four teams submitted technical reports for their models. In the
following, we delve into the discussion and interpretation of the findings concerning the three
research questions we raised in relation to our classification task. These interpretations are
solely based on the evaluation metrics, which are further explained in Section 3.3.
• Regarding RQ1, which aimed to identify the types of models offering better short-term
temporal persistence, we observed that the Baseline model achieved the highest
short-term F1 Score among all the teams. This indicates its strong performance in
maintaining consistency over a shorter time frame compared to its initial performance on the
within-time set. Additionally, when examining the short-term RPD values, we found
that Cordyceps exhibited the smallest drop in performance compared to the Baseline
model.
• Regarding RQ2, which investigated the models offering better long-term temporal
persistence, we observed that Pablojmed achieved the highest F1 Score for the long term.
This indicates its superior ability to maintain performance over an extended period.
Notably, pakapro demonstrated a smaller long-term RPD compared to the other models,
suggesting its potential for maintaining performance stability over time.
• Regarding RQ3, which aimed to identify the models offering better overall
temporal persistence, Pablojmed ranked as the top performing system,
achieving the highest overall score. Its relatively low overall RPD further supports its
consistency across different time frames. Interestingly, pakapro demonstrated promising
results for long-term temporal persistence, despite not achieving the highest long-term
F1 Score.</p>
<p>By delving into the evaluation results, we provided insights into the performance
trends observed among the participating systems. However, it is essential to acknowledge that
the absence of submissions from a certain number of systems may have influenced the overall
interpretation of the findings. To address this limitation, we made our leaderboard available for
future submissions on Codalab (https://codalab.lisn.upsaclay.fr/competitions/12762). This should
ensure a more robust and unbiased assessment of the temporal persistence of text classifiers
within the research community.</p>
      </sec>
      <sec id="sec-3-7">
        <title>3.8. Conclusion</title>
<p>Our overall findings highlight the importance of evaluating temporal persistence in model
performance. The identified models showcase varying levels of both short-term and
long-term persistence. These insights provide valuable guidance for future research and
development efforts aimed at improving temporal consistency in machine learning models. In future
shared tasks, we aim to incorporate evolving training sets, as well as to expand our temporal
persistence investigation to more tasks, including stance detection and topic categorization.</p>
      </sec>
    </sec>
    <sec id="sec-4">
      <title>Acknowledgments</title>
<p>This work is supported by the ANR Kodicare bi-lateral project, grant ANR-19-CE23-0029 of
the French Agence Nationale de la Recherche, and by the Austrian Science Fund (FWF, grant
I4471-N). This work is also supported by a UKRI/EPSRC Turing AI Fellowship to Maria Liakata
(grant no. EP/V030302/1) and The Alan Turing Institute (grant no. EP/N510129/1) through
project funding and its Enrichment PhD Scheme for Iman Bilal. This work has also been using
services provided by the LINDAT/CLARIAH-CZ Research Infrastructure (https://lindat.cz),
supported by the Ministry of Education, Youth and Sports of the Czech Republic (Project No.
LM2018101).</p>
    </sec>
  </body>
  <back>
    <ref-list>
      <ref id="ref1">
        <mixed-citation>
          [1]
          <string-name>
            <given-names>R. Gangi</given-names>
            <surname>Reddy</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Iyer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M. A.</given-names>
            <surname>Sultan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Zhang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Sil</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Castelli</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Florian</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Roukos</surname>
          </string-name>
          ,
          <article-title>Synthetic target domain supervision for open retrieval QA</article-title>
          ,
          <source>in: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval</source>
          , SIGIR '21,
          Association for Computing Machinery, New York, NY, USA,
          <year>2021</year>
          , pp.
          <fpage>1793</fpage>
          -
          <lpage>1797</lpage>
          . URL: https://doi.org/10.1145/3404835.3463085. doi:10.1145/3404835.3463085.
        </mixed-citation>
      </ref>
      <ref id="ref2">
        <mixed-citation>
          [2]
          <string-name>
            <given-names>R.</given-names>
            <surname>Alkhalifa</surname>
          </string-name>
          ,
          <string-name>
            <given-names>I.</given-names>
            <surname>Bilal</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Borkakoty</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Camacho-Collados</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Deveaud</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>El-Ebshihy</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Espinosa-Anke</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Gonzalez-Saez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Goeuriot</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Kochkina</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Liakata</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Loureiro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H. T.</given-names>
            <surname>Madabushi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Mulhem</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Piroi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Popel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Servan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Zubiaga</surname>
          </string-name>
          ,
          <article-title>Overview of the CLEF-2023 LongEval lab on longitudinal evaluation of model performance</article-title>
          , in:
          <source>Experimental IR Meets Multilinguality, Multimodality, and Interaction. Proceedings of the Fourteenth International Conference of the CLEF Association (CLEF 2023), Lecture Notes in Computer Science (LNCS)</source>
          , Springer, Thessaloniki, Greece,
          <year>2023</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref3">
        <mixed-citation>
          [3]
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Deveaud</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Gonzalez-Saez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Mulhem</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Goeuriot</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Piroi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Popel</surname>
          </string-name>
          ,
          <article-title>LongEval-Retrieval: French-English dynamic test collection for continuous web search evaluation</article-title>
          ,
          <year>2023</year>
          . arXiv:2303.03229.
        </mixed-citation>
      </ref>
      <ref id="ref4">
        <mixed-citation>
          [4]
          <string-name>
            <given-names>J.</given-names>
            <surname>Urbano</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Lima</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Hanjalic</surname>
          </string-name>
          ,
          <article-title>A New Perspective on Score Standardization</article-title>
          , in:
          <source>International ACM SIGIR Conference on Research and Development in Information Retrieval</source>
          ,
          <year>2019</year>
          , pp.
          <fpage>1061</fpage>
          -
          <lpage>1064</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref5">
        <mixed-citation>
          [5]
          <string-name>
            <given-names>F.</given-names>
            <surname>Giachelle</surname>
          </string-name>
          ,
          <string-name>
            <given-names>O.</given-names>
            <surname>Irrera</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Silvello</surname>
          </string-name>
          ,
          <article-title>Doctag: A customizable annotation tool for ground truth creation</article-title>
          ,
          in:
          <source>Advances in Information Retrieval: 44th European Conference on IR Research, ECIR 2022, Stavanger, Norway, April 10-14, 2022, Proceedings, Part II</source>
          , volume
          <volume>13186</volume>
          of Lecture Notes in Computer Science, Springer,
          <year>2022</year>
          , pp.
          <fpage>288</fpage>
          -
          <lpage>293</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref6">
        <mixed-citation>
          [6]
          <string-name>
            <given-names>D.</given-names>
            <surname>Harman</surname>
          </string-name>
          ,
          <article-title>TREC-Style Evaluations</article-title>
          , Springer Berlin Heidelberg, Berlin, Heidelberg,
          <year>2013</year>
          , pp.
          <fpage>97</fpage>
          -
          <lpage>115</lpage>
          . URL: https://doi.org/10.1007/978-3-642-36415-0_7. doi:10.1007/978-3-642-36415-0_7.
        </mixed-citation>
      </ref>
      <ref id="ref7">
        <mixed-citation>
          [7]
          <string-name>
            <given-names>R.</given-names>
            <surname>Alkhalifa</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Zubiaga</surname>
          </string-name>
          ,
          <article-title>Capturing stance dynamics in social media: open challenges and research directions</article-title>
          ,
          <source>International Journal of Digital Humanities</source>
          (
          <year>2022</year>
          )
          <fpage>1</fpage>
          -
          <lpage>21</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref8">
        <mixed-citation>
          [8]
          <string-name>
            <given-names>R.</given-names>
            <surname>Alkhalifa</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Kochkina</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Zubiaga</surname>
          </string-name>
          ,
          <article-title>Building for tomorrow: Assessing the temporal persistence of text classifiers</article-title>
          ,
          <source>arXiv preprint arXiv:2205.05435</source>
          (
          <year>2022</year>
          ).
        </mixed-citation>
      </ref>
      <ref id="ref9">
        <mixed-citation>
          [9]
          <string-name>
            <given-names>R.</given-names>
            <surname>Alkhalifa</surname>
          </string-name>
          ,
          <string-name>
            <given-names>I.</given-names>
            <surname>Bilal</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Borkakoty</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Camacho-Collados</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Deveaud</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>El-Ebshihy</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Espinosa-Anke</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Gonzalez-Saez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Goeuriot</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Kochkina</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Liakata</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Loureiro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H. Tayyar</given-names>
            <surname>Madabushi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Mulhem</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Piroi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Popel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Servan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Zubiaga</surname>
          </string-name>
          ,
          <article-title>LongEval: Longitudinal evaluation of model performance at CLEF 2023</article-title>
          , in: J. Kamps, L. Goeuriot, F. Crestani, M. Maistro, H. Joho, B. Davis, C. Gurrin, U. Kruschwitz, A. Caputo (Eds.),
          <source>Advances in Information Retrieval</source>
          , Springer Nature Switzerland, Cham,
          <year>2023</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref10">
        <mixed-citation>
          [10]
          <string-name>
            <given-names>W.</given-names>
            <surname>Yin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Alkhalifa</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Zubiaga</surname>
          </string-name>
          ,
          <article-title>The emojification of sentiment on social media: Collection and analysis of a longitudinal Twitter sentiment dataset</article-title>
          ,
          <source>arXiv preprint arXiv:2108.13898</source>
          (
          <year>2021</year>
          ).
        </mixed-citation>
      </ref>
      <ref id="ref11">
        <mixed-citation>
          [11] D. Loureiro, F. Barbieri, L. Neves, L. Espinosa Anke, J. Camacho-Collados,
          <article-title>TimeLMs: Diachronic language models from Twitter</article-title>
          , in:
          <source>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: System Demonstrations</source>
          , Association for Computational Linguistics, Dublin, Ireland,
          <year>2022</year>
          , pp.
          <fpage>251</fpage>
          -
          <lpage>260</lpage>
          . URL: https://aclanthology.org/2022.acl-demo.25. doi:10.18653/v1/2022.acl-demo.25.
        </mixed-citation>
      </ref>
      <ref id="ref12">
        <mixed-citation>
          [12] R. Alkhalifa, E. Kochkina, A. Zubiaga,
          <article-title>Opinions are made to be changed: Temporally adaptive stance classification</article-title>
          , in:
          <source>Proceedings of the 2021 Workshop on Open Challenges in Online Social Networks</source>
          ,
          <year>2021</year>
          , pp.
          <fpage>27</fpage>
          -
          <lpage>32</lpage>
          .
        </mixed-citation>
      </ref>
    </ref-list>
  </back>
</article>