<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta />
    <article-meta>
      <title-group>
        <article-title>Overview of the “Voight-Kampf” Generative AI Authorship Verification Task at PAN and ELOQUENT 2025</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author">
          <string-name>Janek Bevendorf</string-name>
          <xref ref-type="aff" rid="aff2">2</xref>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Yuxia Wang</string-name>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Jussi Karlgren</string-name>
          <xref ref-type="aff" rid="aff14">14</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Matti Wiegmann</string-name>
          <xref ref-type="aff" rid="aff15">15</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Maik Fröbe</string-name>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Akim Tsivgun</string-name>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Jinyan Su</string-name>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Zhuohan Xie</string-name>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Mervat Abassy</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Jonibek Mansurov</string-name>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Rui Xing</string-name>
          <xref ref-type="aff" rid="aff11">11</xref>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Minh Ngoc Ta</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Kareem Ashraf Elozeiri</string-name>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Tianle Gu</string-name>
          <xref ref-type="aff" rid="aff13">13</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Raj Vardhan Tomar</string-name>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Jiahui Geng</string-name>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Ekaterina Artemova</string-name>
          <xref ref-type="aff" rid="aff12">12</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Artem Shelmanov</string-name>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Nizar Habash</string-name>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Efstathios Stamatatos</string-name>
          <xref ref-type="aff" rid="aff16">16</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Iryna Gurevych</string-name>
          <xref ref-type="aff" rid="aff10">10</xref>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Preslav Nakov</string-name>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Martin Potthast</string-name>
          <xref ref-type="aff" rid="aff15">15</xref>
          <xref ref-type="aff" rid="aff17">17</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Benno Stein</string-name>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <aff id="aff0">
          <label>0</label>
          <institution>Alexandria University</institution>
          ,
          <country country="EG">Egypt</country>
        </aff>
        <aff id="aff1">
          <label>1</label>
          <institution>BKAI Research Center, Hanoi University of Science and Technology</institution>
          ,
          <country country="VN">Vietnam</country>
        </aff>
        <aff id="aff2">
          <label>2</label>
          <institution>Bauhaus-Universität Weimar</institution>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff3">
          <label>3</label>
          <institution>Cluster Innovation Center, University of Delhi</institution>
          ,
          <country country="IN">India</country>
        </aff>
        <aff id="aff4">
          <label>4</label>
          <institution>Cornell University</institution>
          ,
          <country country="US">USA</country>
        </aff>
        <aff id="aff5">
          <label>5</label>
          <institution>Friedrich-Schiller-Universität Jena</institution>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff6">
          <label>6</label>
          <institution>Leipzig University</institution>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff7">
          <label>7</label>
          <institution>Mohamed bin Zayed University of Artificial Intelligence</institution>
          ,
          <country country="AE">UAE</country>
        </aff>
        <aff id="aff8">
          <label>8</label>
          <institution>Nebius AI, Netherlands; KU Leuven</institution>
          ,
          <country country="BE">Belgium</country>
        </aff>
        <aff id="aff9">
          <label>9</label>
          <institution>New York University Abu Dhabi</institution>
          ,
          <addr-line>UAE</addr-line>
          ,
          <country country="US">USA</country>
        </aff>
        <aff id="aff10">
          <label>10</label>
          <institution>TU Darmstadt</institution>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff11">
          <label>11</label>
          <institution>The University of Melbourne</institution>
          ,
          <country country="AU">Australia</country>
        </aff>
        <aff id="aff12">
          <label>12</label>
          <institution>Toloka AI</institution>
          ,
          <country country="NL">Netherlands</country>
        </aff>
        <aff id="aff13">
          <label>13</label>
          <institution>Tsinghua University</institution>
          ,
          <country country="CN">China</country>
        </aff>
        <aff id="aff14">
          <label>14</label>
          <institution>University of Helsinki</institution>
          ,
          <country country="FI">Finland</country>
        </aff>
        <aff id="aff15">
          <label>15</label>
          <institution>University of Kassel</institution>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff16">
          <label>16</label>
          <institution>University of the Aegean</institution>
          ,
          <country country="GR">Greece</country>
        </aff>
        <aff id="aff17">
          <label>17</label>
          <institution>hessian.AI</institution>
          ,
          <addr-line>Germany; ScaDS.AI</addr-line>
          ,
          <country country="DE">Germany</country>
        </aff>
      </contrib-group>
      <pub-date>
        <year>2025</year>
      </pub-date>
      <abstract>
        <p>The “Voight-Kampf” Generative AI Authorship Verification task aims to determine whether a text was generated by an AI or written by a human. The 2025 edition of the task explores two subtasks: Subtask 1 tests the detection of purely AI generated text with potentially unknown obfuscations, and as such continues our research from 2024. The task is again organized as a builder-breaker challenge together with the ELOQUENT lab. The PAN participants submitted 24 detectors. The best system achieves a mean score of 0.99, the best baseline achieves a score of 0.92. ELOQUENT participants submitted 13 new test datasets with 22 obfuscated texts each. The most difficult dataset achieves a mean C@1 − 1 score of 0.63. Subtask 2 investigates texts with six degrees of human-AI collaboration: (i) fully human-written, (ii) human-written, then machine-polished, (iii) machine-written, then machine-humanized (obfuscated), (iv) human-initiated, then machine-continued, (v) deeply mixed text, where some parts are written by a human and some are generated by a machine, and (vi) machine-written, then human-edited. The dataset contains over half a million examples in total and is composed from several relevant AI-detection datasets across multiple text genres. PAN participants submitted 21 detectors to subtask 2. The best system achieves an F1 score of 0.65, the best baseline a score of 0.48. The data, baselines, and the code used for creating the datasets and evaluating the systems are available.1</p>
      </abstract>
      <kwd-group>
        <kwd>Generative AI Detection</kwd>
        <kwd>LLM Detection</kwd>
        <kwd>Human-AI Collaboration</kwd>
        <kwd>Workshop</kwd>
        <kwd>PAN</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec id="sec-1">
      <title>1. Introduction</title>
      <p>Authorship verification is a fundamental task in author identification. PAN has continuously been
organizing authorship verification tasks for years [ 1, 2, 3, 4] and with generative AI / LLM detection
being fundamentally also an authorship verification task [5], we decided to “delve” into that realm.</p>
      <p>For the 2025 edition of this task, we increase the difficulty of the binary detection setting and add
a new subtask with a focus on detecting human-AI collaboration. In the first edition of this task [6],
“Voight-Kampf” was organized as a classic authorship verification task: Given two texts, one authored
by a human, one by a machine: pick out the human. Against our initial assumptions, the task was of
little challenge to the participants’ systems even in the face of text obfuscation. Thus, for this year, we
elevate the challenge in two directions.</p>
      <p>First, we reduce the original “Voight-Kampf” setting to a binary task: Given a text, determine if it was
written by a human or an AI model. In addition, we extend our selection of obfuscation methods to test
model sensitivity and add larger, newer, and more difficult-to-detect models. As before, the PAN and
ELOQUENT labs jointly organize this task: ELOQUENT participants provide very strongly obfuscated
texts from a set of summaries, and PAN participants develop systems to robustly detect AI generated
text. The ELOQUENT contributions are described in Section 2 and the PAN contributions are described
in Section 3. As in the previous year, all systems are submitted as immutable docker containers via
TIRA [7] for easy reproducibility.</p>
      <p>Second, we organize a new subtask aiming to detect human-AI collaboration, since joint writing
and editing of documents with AI tools has become commonplace and such cases are presumably
harder to detect than in the binary case. The subtask is a logical continuation of previous shared tasks
organized at SemEval and GenAIDetect [8, 9]. Subtask 2 asks to distinguish six cases of joint human-AI
authorship: (i) fully human-written, (ii) human-written, then machine-polished, (iii) machine-written,
then machine-humanized (obfuscated), (iv) human-initiated, then machine-continued, (v) deeply mixed
text, in which some parts are written by a human and some are generated by a machine, and (vi)
machine-written, then human-edited. This subtask is described in Section 4.</p>
    </sec>
    <sec id="sec-2">
      <title>2. ELOQUENT: Generating Hard-to-detect AI-texts</title>
      <p>ELOQUENT participants generate datasets of machine text, attempting to break the classifier systems
built by PAN participants.</p>
      <p>In its first edition, the classifiers submitted by participants to the PAN lab handily classified the texts
into human vs. machine. We found that of the submitted datasets in 2024, all were able to fool some
of the classifier systems some of the time; but no generative model was consistently able to convince
the better classifier systems that it was human. It was clear that machine-generated texts appeared to
consistently hold to certain detectable stylistic indicator features. [6]</p>
      <sec id="sec-2-1">
        <title>2.1. Dataset</title>
        <p>For the test set, 22 texts written by human authors, of between 350–700 words were selected. Most
original texts were longer and a suitably long section of the text was selected. Summaries of each text
were generated by the organizers using OpenAI’s ChatGPT service using the prompt “Summarize the
main points of the following text and give an overall description of the genre and tone of the text.”
Those summaries were then shared to the participants for their systems to generate short texts on the
basis of the summaries. A sample summary test item is given in Figure 1 and a list of item titles is given in Table 1.</p>
        <p>A suggested prompt was given to the participants—“Write a text of about 500 words which covers
the following items”—but the participants were free to formulate their own prompts as they saw fit.
The generated texts were submitted by the participants through a submission form, and then further
submitted by the organizers to the PAN lab for classification.</p>
        <sec id="sec-2-1-1">
          <title>Genre and Style:</title>
          <p>Genre: Mythological and Linguistic Ethnography / Cultural Anthropology
Tone: Scholarly, reverent, and lyrical, blending academic analysis with a poetic appreciation of
language, mythology, and cultural worldview.
Finnish language and culture are deeply intertwined with nature, with precise and acoustically rich
verbs used to describe natural elements like snow, wind, and animals.</p>
          <p>Ancient Finns practiced animistic nature-worship, viewing all visible forces sun, moon, sea, earth as
living, conscious beings.</p>
          <p>Over time, belief evolved to include invisible spiritual beings, or haltiat (genii/regents), who governed
natural elements and had both form and spirit, though lesser ones were more formless and abstract.
These haltiat were immortal and hierarchical, often ranked based on the significance of their domain
(e.g., Tapio of the forest outranking Pilajatar, daughter of the aspen).</p>
          <p>Finnish mythology emphasizes the independence and dignity of each deity, regardless of power; even a
minor god rules absolutely within their sphere.</p>
          <p>Deities were typically paired and familial, with the sky and celestial bodies being the earliest and most
revered objects of worship, leading to the concept of Jumala, the thunder-home, as the supreme god.</p>
        </sec>
      </sec>
      <sec id="sec-2-2">
        <title>2.2. Submissions</title>
        <sec id="sec-2-2-1">
          <title>Model</title>
          <p>GPT-2
Qwen3-8B
Qwen3-8B
Qwen3-8B
Qwen3-8B
Claude Sonnet3.5
Claude Sonnet3.5
0.136
Of the 49 registered participants, five teams submitted 10 experiments [ 10, 11, 12, 13, 14]. The attrition
rate is great, similar to last year, and we will investigate the possibility of turning this task into a
continuously open experiment with asynchronous submission.</p>
          <p>Table 2 lists the participating systems and the classification results per system using the C@1 accuracy
score aggregated from the PAN lab participants.</p>
        </sec>
      </sec>
      <sec id="sec-2-3">
        <title>2.3. Observations</title>
        <p>This year, we find that of the submitted ELOQUENT-generated datasets, all were able to fool some of
the classifier systems some of the time, and several were able to do so better than chance. This is a
considerable improvement from last year, and reflects a more directed experimentation in the submitted
experiments, e.g., by inserting explicit human variation and departure from norm in the generated texts.</p>
        <p>The human authored texts were on average misclassified as machine generated only 15% of the time
and this is entirely due to two texts which both caused a majority of the classification systems to assign
them a non-zero likelihood to be machine-generated: the excerpt from the Maastricht treaty (038) and
the Intro to Large Language Models (045). All other human-authored texts were correctly assumed to
be human-authored by every participating classifier system. The best generative models fooled the
classifiers with many, but never with all of their generated texts. Some of the test items appear to have
been easier to generate convincing human-like output for: Session Moderator Instructions (036), What
is Free Software? (042), and the Wikipedia text on Saffron (052). It is unclear if this is an effect of the
qualities of the summaries or if the language model training data are better equipped for academic text
than for other genres.</p>
        <p>As a very general finding, it is clear that generative language models still have recognizable traits in
their output and that classifiers are able to pick up on them quite effectively. Generating truly human-like
text remains an open challenge for developers and operators of generative language models.</p>
      </sec>
    </sec>
    <sec id="sec-3">
      <title>3. PAN Subtask 1: Voight-Kampf AI Detection Sensitivity</title>
      <p>At PAN 2024 [15], we offered, for the first time, the “Voight-Kampf” Generative AI Authorship Verification
task [6], which attracted a large number of submissions. To start with, we formalized different task
variants and ordered them from easiest to hardest (Figure 2). To establish a baseline, we decided to start
with the easiest variant, in which participants were given a pair of texts, of which exactly one was of
human and the other of machine origin.</p>
      <p>Input / Task
1. { ? , ? }
2. { ? , ? }
3. { ? , ? }
4. { ? , ? }
5. { ? , ? }
6. { ? , ? }
7.</p>
      <p>?
→−</p>
      <p>Possible Assignment Patterns
1. { A , M }
2. { A , M }, { A , A }
3. { A , M }, { M , M }
4. { A , M }, { A , A }, { M , M }
5. { A , M }, { A , A }, { A , B }
6. { A , M }, { A , A }, { A , B }, { M , M }
7.</p>
      <p>A ,</p>
      <p>M</p>
      <p>For PAN 2025 [16], we move on to the harder variant, in which participants are given only one text.
This variant reflects a more realistic scenario of authorship verification “in the wild,” and it also aligns
with the settings commonly addressed in other LLM detection shared tasks. The PAN 2025 subtask 1 is
again co-organized with the ELOQUENT lab [17] in a builder-breaker style.</p>
      <p>Subtask 1 is in essence the classic binary detection task known also from other LLM detection
shared tasks. However, we are testing the limits of the detectors by crafting a test set with text
“obfuscations” that try to evade detection. Apart from drastic text length restrictions, the obfuscations
we tested or received from ELOQUENT participants in the previous year had turned out to be mostly
ineffective. So this year, we tested what happens when the human writers obfuscate their style and
whether machines can replicate this.</p>
      <sec id="sec-3-1">
        <title>3.1. Dataset</title>
        <p>The training, validation, and test datasets were built from a selection of 19th-century English fiction
from Project Gutenberg,1 the Extended Brennan-Greenstadt [18] corpus, and the Riddell-Juola [19]
corpus. We also included a sample of the PAN’24 training and test sets (containing U.S. news articles
from 2021) in this year’s training and test sets (the PAN’24 test data, which were never openly shared).
Participants were free to use any other sources for augmenting their training data, including the PAN’24
training set.</p>
        <p>The (Extended) Brennan-Greenstadt corpus has been used a lot in the fields of authorship verification
and authorship obfuscation. For its construction, volunteers on Amazon Mechanical Turk were asked to
submit existing writing samples (about 500 words) of their own. They were then asked to write another
text describing their neighborhood, but in a way that hides their own writing style. No particular
instructions were given for how this style obfuscation was to be achieved. They were also asked to write
a third text in which they should try to imitate a particular style from a given sample of a novel. For
PAN, we used the original and obfuscated neighborhood essays only. We chose this corpus in particular,
as the obfuscated nature of the texts adds an interesting aspect to the classification task.</p>
        <p>The Riddell-Juola corpus was created for a replication study of the original experiments by Brennan
et al., but on a larger scale and with an additional control group. Volunteers were also asked to submit
existing writing samples and a new piece about their neighborhood. However, the control group was
not instructed to obfuscate their style.</p>
        <p>You are an essay summarizer and a forensic writing style analyst. Given an essay, you summarize its key
points in ten bullet points, extract the main topic in just a few words, and some other details.
As additional details, classify the point of view from which the essay is written ("first-person",
"third-person", "second-person") and the tense ("past-tense" or "present-tense"). If the essay is
argumentative, classify the author’s stance on the main topic as "pro" or "con" or "neutral" if the article is
not argumentative.</p>
        <p>Point out any traits that make the author’s writing style unique, but phrase them as instructions for
another writer who wants to imitate the style. For example: "Use very short sentences," "Use passive voice
a lot," "Write in a nominal style with few adjectives," "Use very poetic and descriptive language," "Add
spelling mistakes," "Start multiple sentences with the same word," "Use the word X more often than usual,"
"Write in long and nested sentences," "Use a lot of technical terms," "Use very simple language," "Use
metaphors/similes/anaphoras/alliterations/...," etc.</p>
        <p>Answer in structured JSON format (without Markdown formatting) like so:
{
"key_points": ["key point 1", "key point 2", ...],
"main_topic": "essay topic",
"pov": "narrative point of view, either first-person, third-person, or
second-person",
"tense": "essay tense, either past-tense or present-tense",
"stance": "stance on topic, either pro, con, or neutral",
"style_instructions": ["style instruction 1", "style instruction 2", ...]
}</p>
        <p>From Project Gutenberg, we sampled 927 English novels tagged as “19th-century” and “Fiction,”
trying not to duplicate individual titles. The texts were cleaned of headers and footers and split into no
more than 10 chunks of 500–700 words, resulting in a total of 9,185 original human texts.</p>
        <sec id="sec-3-1-1">
          <title>3.1.1. Summary Generation</title>
          <p>From the essays, we generated JSON-structured bullet-point summaries with GPT-4o using the prompt
displayed in Figure 3. The model was instructed to extract a list of key points from the text, but
also stylistic markers such as the narrative point of view, the tense, stance, and any other traits that
seem unique to that text (such as the use of many technical terms, very poetic language, etc.). The
neighborhood texts were summarized with a similar prompt, but with the additional information
that the text describes a neighborhood and that the author tried to hide their writing style. For the
19th-century fiction texts, a much simpler prompt was used that asked only for a bullet-point summary
given a section of a novel.</p>
        </sec>
        <sec id="sec-3-1-2">
          <title>3.1.2. Machine Text Generation</title>
          <p>From the summaries, we generated the final machine texts using 14 different LLMs: GPT-3.5 Turbo,
GPT-4o, GPT-4o-mini, GPT-4.5-preview, OpenAI o1, OpenAI o1-mini, OpenAI o3-mini, Gemini 1.5 Pro,
Gemini 2.0 Flash, DeepSeek-R1-Qwen-32b, Falcon3-10b, Llama3.1-8b, Llama3.3-70b, Ministral-8b-2410.</p>
          <p>
            Using the summaries extracted earlier, we asked the models to write (
            <xref ref-type="bibr" rid="ref1">1</xref>
            ) argumentative essays for each
of the original and obfuscated neighborhood essays from the Brennan-Greenstadt and Riddell-Juola
corpora; and (
            <xref ref-type="bibr" rid="ref2">2</xref>
            ) continuations of the novel chunks as faithful pastiches in honor of the original authors.
In both cases, the models were instructed to follow the given style instructions. For the neighborhood
essays, the models were explicitly told to follow these instructions to mask their own style. For the
novels, we asked the models not to plagiarize the original texts, since we were almost sure that Project
Gutenberg was part of the training data of most models. Despite this instruction, some of the models
still reproduced parts of the original novels, but to a degree we deemed acceptable (e.g., OpenAI o1-mini
reproduced two paragraphs of Charlotte Brontë’s “The Professor” before continuing with new text).
          </p>
        </sec>
        <sec id="sec-3-1-3">
          <title>3.1.3. Machine Text Obfuscation</title>
          <p>
            In addition to the style-informed prompts described above, we also prompted GPT-4o and OpenAI
o1 with five variations of the essay prompts, in which we instructed the models (
            <xref ref-type="bibr" rid="ref1">1</xref>
            ) to write like a
7-year-old, (
            <xref ref-type="bibr" rid="ref2">2</xref>
            ) write like a 7-year-old, but only given the main topic and no key points, (
            <xref ref-type="bibr" rid="ref3">3</xref>
            ) start every
sentence with an alliteration, (
            <xref ref-type="bibr" rid="ref4">4</xref>
            ) read but ignore a list of 200 random words we appended to the real
prompt, (
            <xref ref-type="bibr" rid="ref5">5</xref>
            ) write in Japanese subject-object-verb word order (“Yoda grammar”).
          </p>
          <p>As a sixth obfuscation, we increased the temperature of DeepSeek R1 to a value of 1.1, the highest
that still produced sensible text.</p>
          <p>Finally, we also paraphrased already-generated texts from GPT-4o, GPT-4.5, and DeepSeek R1 using
the adversarial DIPPER paraphraser [20]. Unfortunately, we were unable to get a satisfactory text
quality from this. We used the implementation and pre-trained models provided by the authors, yet the
paraphraser often generated long strings of repeated white space and punctuation marks and sometimes
degenerated entirely into seemingly random tokens. We got this result with both the context-aware
and the context-unaware model and with all sampling parameters we tested. To improve the quality,
we tried to clean up at least some of the repeated characters using a regular expression.</p>
        </sec>
        <sec id="sec-3-1-4">
          <title>3.1.4. Dataset Sampling and Splits</title>
          <p>From all generated texts, we randomly sampled a subset with a varying balance of human to LLM
texts. We sampled from the Gutenberg set with a human:LLM ratio of 1:1, from Brennan-Greenstadt,
Riddell-Juola, and PAN’24 training with a 1:4 ratio, and PAN’24 test with a 1:1 ratio.</p>
          <p>The sampled texts were split into training, validation, and test sets. Texts created by OpenAI o1
and o1-mini were held back and included only in the test set. This was inspired by our earlier
findings [5] that OpenAI o1 was much harder to detect when detectors were trained only on other
LLMs. For this reason, we also created new o1 texts from the PAN’24 news article summaries. The
texts provided by ELOQUENT participants were later added to the test set without balancing them. We
kept track of the generating models and the individual source datasets, so we could later distinguish
between different subsets of the full test set during the evaluation phase. For this reason, we distinguish
between the terms dataset, which refers to all data from all sources, and data subset, which refers to
texts from an individual source. The subsets were chosen so that each subset contains both classes
(with the exception of the OpenAI o1 and DIPPER-obfuscated Gutenberg subsets), which is crucial for
measures such as ROC-AUC and F1. A detailed listing of all subsets, splits, and class balances is given
in Table 3.</p>
        </sec>
      </sec>
      <sec id="sec-3-2">
        <title>3.2. Baselines</title>
        <p>
          We provide three baselines: Two zero-shot baselines and one supervised. Baseline (
          <xref ref-type="bibr" rid="ref1">1</xref>
          ) is an
implementation of a Binoculars [21] model using Llama-3.1-8b and Llama-3.1-8b-instruct. The decision threshold
was optimized for high accuracy (not for a low false-positive rate as in the original paper) on the
validation set that was handed out to participants. Baseline (
          <xref ref-type="bibr" rid="ref2">2</xref>
          ) is a simple PPMd-based compression
model using the compression-based cosine measure [22, 23]. The operating point of this detector was
also tuned on the validation set. Lastly, as a supervised baseline, (
          <xref ref-type="bibr" rid="ref3">3</xref>
          ) we provide a linear SVM trained on
the top-1000 TF-IDF 1–4-grams from the validation set. The TF-IDF detector and Binoculars can be
considered state of the art; the compression model represents a more conservative lower baseline.
        </p>
      </sec>
      <sec id="sec-3-3">
        <title>3.3. Submissions</title>
        <p>We received submissions from 24 teams, 21 of which also submitted a work notebook describing their
approach. The following section gives an overview of them with a short description for each. Table 4
summarizes the use of certain feature families and methods employed by the submitted systems.
Basani and Chen [36] use GPT-2 to estimate mean, variance, skewness, and kurtosis of the negative
per-token log likelihood (token “surprisal”) distributions of a text. In addition to these stationary
statistical moments, they also calculate their first- and second-order rates of change. An XGBoost
classifier is learned on these features for distinguishing between human and LLM texts.
Huang et al. [40] fine-tune a RoBERTa model for binary classification, but do extensive data
augmentation work. The existing training data is modified via synonym replacement and sentence
reorganization, and noise is added via random repetitions or stop and fill word insertions. In addition,
new generated documents are added using models not included in the training data, where different
prompts are used to generate text across different genres to roughly double the number of generated
training texts.</p>
        <p>Jimeno-Gonzalez et al. [32] use stylometric and word-frequency features with a stacking ensemble
of Random Forest, XGBoost, and LinearSVC, where a logistic regression classifier casts the ensemble
vote. Specifically, the system uses the function-word ratio, average tf-idf scores, mean sentence length,
mean word length, number of sentences, POS-tag distribution, and word frequency features over the
vocabulary.</p>
        <p>Kumar et al. [42] fine-tune a DistilBERT for binary classification on the provided training data. As
features, the system uses the base model’s CLS embedding concatenated with five stylometric features:
average word length, average sentence length, punctuation frequency, type-token ratio, and character
entropy.</p>
        <p>Larson [39] aims to be especially light-weight and uses a Support Vector Machine with RBF kernel
and class weighting for binary classification. As features, the system uses the 40 most frequent 1- and
2-grams, along with the 15 most frequent punctuation features, selected based on performance on the
validation set.</p>
        <p>Liang et al. [43] fine-tune a ModernBERT for binary classification and use a custom loss function
weighted by an example’s difficulty.</p>
        <p>Liu et al. [25] use an ensemble of a fine-tuned Qwen and a fine-tuned ModernBERT with contrastive
loss to distinguish between human and LLM-generated text. The dataset was augmented by generating
LLM paraphrases of the human texts.
Macko [24] fine-tune a Qwen3-14B model via QLoRA for binary classification. What makes the
approach work is the obfuscation via a homoglyph (replacement of characters with similar-looking
Unicode characters) attack of parts of the training data (a variant of training data enhancement) and
model selection on external training data to select the best model based on out-of-domain performance.
The external training data is a collection of 2,000 examples across seven languages, sampled from 18
different AI-detection datasets of different genres and domains [44].</p>
        <p>Marchitan et al. [30] describe two systems: First, a voting-ensemble of LightGBM, XGBoost, Logistic
Regression, and an SVM using embeddings from Qwen3-0.6B, and, second, fine-tuning a classification
head on top of an LLM, where Qwen2.5-0.5B, Qwen3-0.6B, Mistral7B-v0.1, and Llama-3.1-8B were
tested.</p>
        <p>Ochab et al. [34] use an LGBM classifier on four types of stylometric features extracted using spaCy
and inspired by the “stylo” R package. The dataset was augmented by adding about half a million texts
from other AI detection datasets. 10-fold cross-validation was used to select the best hyperparameters
for the model.</p>
        <p>Ostrower et al. [35] propose three systems. The first system is an XGBoost ensemble using the
Binoculars score, tf-idf scores, and BERT embeddings as features. The second, not submitted system
uses a Maximum Likelihood Estimation based on various features: cohesiveness, type-token ratio,
word count, stop-word frequency, non-sentiment word frequency, POS distribution. Cohesiveness
is computed via the average BARTScore (between a text and multiple noisy, obfuscated copies) and
multiple zero-shot LLM detectors (LRR, FastDetectGPT). The third approach uses adversarial training
following Hu et al. [45].</p>
        <p>Pudasaini et al. [37] use various ensembling strategies (voting, stacking, and boosting) with a
number of fine-tuned pre-trained language models (such as DeBERTa, Longformer, RoBERTa, etc.). The
results were tested on the PAN’25 dataset and the COLING’25 dataset.</p>
        <p>Teja et al. [31] fine-tune several pre-trained language models (such as DeBERTa, DistilBERT, XLNet,
and others) with a mixture-of-experts gating mechanism. They tested a hard gating mechanism which
selects only a single expert and a soft gating mechanism which uses a linear layer with softmax to
select a weighted sum of expert outputs. A DeBERTa model with hard gating performed best on the
validation set and was submitted to Tira.</p>
        <p>Seeliger et al. [26] generate a matrix of cumulative binary correlation coefficients between terms
and documents. Three different versions were submitted using word unigrams, bigrams, and trigrams
as terms. The authors also supplied two baselines, one of which is a fine-tuned RoBERTa model. The
approach allows for stationary analysis of the whole document, but also temporal analysis of the
cumulative sum of correlation coefficients per word.</p>
        <p>Sun et al. [38] use a combination of 25 stylometric and 25 entropy-based features and a voting
ensemble of five different classifiers (Gaussian Naive Bayes, AdaBoost, LightGBM, CatBoost, Random
Forest). The stylometric features were selected via univariate feature selection from 101 features suggested
by the Claude LLM. The entropy-based features are selected via univariate feature selection from 72
statistical features describing the distribution of per-token forward and backward cross-entropy losses,
following Guo et al. [46]. These losses are computed by “regenerating” a training text via teacher
forcing, given a generated summary as prompt, and measuring the loss of each predicted output logits
to the target (forward) and to the last token of the prefix (backward), according to a Llama2-7b.
Titze and Halvani [41] use off-the-shelf pre-trained LLMs to extract negative log likelihood
“surprisal”, mean Shannon entropy, log rank, Jensen-Shannon divergence from the token representation of
the text. A logistic regression classifier is learnt to combine the features into a final score.
Valdez-Valenzuela generate a syntactic dependency graph representation of the text. Sentence-level
graphs are merged into document-level graphs, which are embedded using a graph neural network for
use in a dense neural network for classification. The dataset is augmented with three different kinds of
obfuscations (shortening, Unicode replacement, paraphrasing) to make the system more robust.
Völpel and Halvani [33] use linearized constituent trees n-grams as features for a feed-forward
neural network classifier. The trees are parsed via the Constituent Treelib library [47] and segmented
by traversing from every node to all possible leaves and counting all encountered paths of length 1-7.
Voznyuk et al. [27] use multi-task learning with DeBERTa-v3 base. The system learns three tasks,
each utilizing a different head: the binary Voight-Kampf task, a 3-class genre prediction task, and a
4-class model family prediction task. The model is then trained for all tasks, albeit only some losses are
propagated to the base model to prevent overfitting. The results of the Voight-Kampf classification
head are reported for the competition.</p>
        <p>Yang and Yan [28] fine-tune a BERT model using genre-dependent contrastive loss. For this, the
CLS output token is concatenated with a learned genre vector, which is then fed into an MLP with
contrastive loss.</p>
        <p>Zaidi et al. [29] fine-tune an uncased BERT-base for binary classification of human and LLM texts.</p>
        <p>Team</p>
      </sec>
      <sec id="sec-3-4">
        <title>3.4. Evaluation</title>
        <p>The final system ranking based on the PAN and ELOQUENT test sets is listed in Table 5. Participating
teams that submitted more than one system are ranked only with their best-performing system.</p>
        <p>All systems were submitted and evaluated on Tira [7]. At test time, the participants had to calculate
a score between 0 and 1 for each text, indicating the likelihood that the text was LLM-generated. A
score of exactly 0.5 could be given to signal a non-decision.</p>
        <sec id="sec-3-4-1">
          <title>3.4.1. Score Calculation</title>
          <p>For each participant, we computed a confusion matrix and the following scores, which we used in
previous authorship verification shared tasks as well:
• ROC-AUC: The area under the Receiver Operating Characteristic curve.
• Brier: The complement of the Brier score (mean squared loss)
• C@1: A modified accuracy score that assigns non-answers (score = 0.5) the average accuracy of
the remaining cases [48].
• F1: The harmonic mean of precision and recall.</p>
          <p>F1</p>
          <p>Macko
Valdez-Valenzuela</p>
          <p>Liu
Seeliger
Voznyuk</p>
          <p>Yang</p>
          <p>Zaidi
hello-world
Baseline TF-IDF SVM</p>
          <p>bohan-li
Marchitan</p>
          <p>Teja
xlbniu
Jimeno-Gonzalez</p>
          <p>Völpel</p>
          <p>Ochab
Ostrower</p>
          <p>Basani
Pudasaini</p>
          <p>Sun
Larson
Huang</p>
          <p>Titze
Baseline Binoculars Llama3.1</p>
          <p>Kumar
Baseline PPMd CBC</p>
          <p>Liang
0.4
0.5</p>
          <p>0.6 0.7 0.8
Mean Score Per Dataset</p>
          <p>• F0.5u: A modified F 0.5 measure (precision-weighted F measure) that treats non-answers (score =
0.5) as false negatives [49].</p>
          <p>• Mean: The arithmetic mean of all previous measures</p>
          <p>The six measures are calculated for each system on each of the test data subsets described in
Section 3.1.4. A final system score is calculated per measure as a macro average over all subsets. The
resulting macro mean score is used to determine the system’s ranking in the final leaderboard. The
mean scores for data subsets with only positive examples (i.e., the held-back OpenAI o1 and
DIPPER-obfuscated Gutenberg subsets) are calculated without the contribution of ROC-AUC, as the value would
be undefined. The precision for the F 1 and F0.5u calculations is assumed to be 1.0 in those cases.</p>
        </sec>
        <sec id="sec-3-4-2">
          <title>3.4.2. Final Ranking and Discussion</title>
          <p>Table 5 shows the final macro scores per team and system. 2 If teams submitted multiple systems, only
the best-performing system is considered in the ranking. Of the 24 submitted systems, 8 beat the
strongest baseline (TF-IDF SVM) and 14 more beat the second-strongest baseline (Binoculars). Macko
[24] lead the ranking with Valdez-Valenzuela (no notebook) and Liu et al. [25] being the runners-up.</p>
          <p>Figure 4 shows the distribution of mean scores for each system over test data subsets from which
the macro means are calculated. Most systems range between 0.7 and 1.0 for most subsets, apart
2Note: Bevendorf et al. [16] and an earlier draft of this paper described significantly worse scores for all systems. This was due to a
score calculation error on the ELOQUENT data subset, which has been corrected in this version.</p>
          <p>Riddell-Juola (Obfuscated)
Brennan-Greenstadt (Obfuscated)</p>
          <p>ELOQUENT
PAN'24 Test
PAN'24 (o1)</p>
          <p>Riddell-Juola</p>
          <p>Gutenberg Fiction</p>
          <p>Brennan Greenstadt
Gutenberg Fiction (Obfuscated)</p>
          <p>Gutenberg Fiction (o1)</p>
          <p>Riddell-Juola (o1)</p>
          <p>Riddell-Juola</p>
          <p>PAN'24 Test
Riddell-Juola (Obfuscated)</p>
          <p>Brennan Greenstadt
tse ELOQUENT
taBrennan-Greenstadt (Obfuscated)
aD Gutenberg Fiction</p>
          <p>PAN'24 (o1)
Gutenberg Fiction (Obfuscated)</p>
          <p>Gutenberg Fiction (o1)</p>
          <p>Riddell-Juola (o1)
Brennan-Greenstadt (Obfuscated)</p>
          <p>Riddell-Juola (Obfuscated)</p>
          <p>ELOQUENT</p>
          <p>PAN'24 (o1)
tse Gutenberg Fiction
ta Gutenberg Fiction (Obfuscated)
aD PAN'24 Test</p>
          <p>Gutenberg Fiction (o1)</p>
          <p>Riddell-Juola
Brennan Greenstadt</p>
          <p>Riddell-Juola (o1)
0.0
0.0
0.2 0.4
1.0 – C@1: All Systems
from certain outliers. On the ELOQUENT subset, the majority of systems achieve scores between 0.7
and 0.8, dragging down their final scores. The best system by Macko [24] is an exception to this rule by
displaying more than solid performance also on ELOQUENT. The mean false positive rate of all systems
on ELOQUENT’s human texts is 0.15, which is probably best explained by the topic domain shift from
the PAN training data. Cross-domain classification has been a longstanding problem in authorship
identification [50], but also in LLM detection in particular [51].</p>
          <p>If we analyze the distribution of C@1 scores per subset (Figure 5), we can see the obfuscated
Brennan-Greenstadt and Riddell-Juola and ELOQUENT subsets seem to be the most difficult. However, if we
look at false positive and false negative rates separately, we can find this effect only in the latter. That
means LLMs were more successful than humans in obfuscating their distinct “LLM style,” whereas the
obfuscated human texts still look human enough. This is not entirely unexpected, since the human
authors were only asked to mask their personal style, not to “write like a machine” (and LLMs did not
exist back then, anyway). The LLMs, on the other hand, apparently managed to deviate sufficiently from
their usual style that makes them otherwise stand out. This effect, however, must also be attributed in
large part to the other obfuscations we applied. Due to the way the test set was sampled, only parts of
these subsets are generated from the plain neighborhood prompt (57 % for Brennan-Greenstadt, 18 %
for Riddell-Juola). Hence, further analysis is needed to distinguish between these two effects. Since
these obfuscations are in part also purely prompt-based, we can assume, though, that LLMs did manage
to change their style sufficiently. Considering that the human texts were classified correctly for the
most part, the topic / domain shift alone cannot serve as an explanation.</p>
          <p>The Gutenberg data, on the other hand, seems to be the easiest, which may indeed be a result of the
text homogeneity between training and test. The held-back OpenAI o1 texts also posed no particular
challenge. This is in contrast to our previous findings [5]. This is even true for the TF-IDF SVM baseline,
which is identical to the detector we used in those earlier experiments. This peculiarity definitely
warrants more research. Perhaps the inclusion of GPT-4.5 texts in the training data played a role, but
perhaps the included sample of only 300 texts was too small.</p>
          <p>Figure 6 shows the effectiveness of individual obfuscations (not including the plain neighborhood
prompt) described in Section 3.1.3. DIPPER paraphrasing appears to be highly effective in avoiding
detection, but this comes at a heavy price with regard to text quality. Hence, this result should be taken
with a grain of salt. The second-most effective obfuscation appears to be the prompt to write like a
7-year-old. This obfuscation uses only the prompt to change the model output, which appears to be
sufficient to successfully avoid detection in most cases. Prompting the model to change the word order
to subject-object-verb shows a medium effectiveness. All three are even more effective if tested against
only the top-10 systems, hinting at overfitting. Increasing the temperature, prompting for alliterations,
or inserting random words into the prompt did not have any significant impact.</p>
          <p>The overall best performing system by Macko [24] had a false-negative rate of 42 % on
DIPPER-paraphrased texts, but was entirely unaffected by all other obfuscations.</p>
        </sec>
      </sec>
    </sec>
    <sec id="sec-4">
      <title>4. PAN Subtask 2: Human-AI Collaboration</title>
      <p>The rise of AI-assisted writing is transforming the traditional notion of authorship and steadily blurring
the boundary between human and machine contributions. As AI involvement now spans the full
spectrum—from none to complete [52], new questions of ethical and intellectual accountability arise.
Subtask 2 seeks to address these challenges by asking the participants to classify a collaboratively
authored human-AI document into one of six categories, defined from an ethical and intellectual
accountability perspective:
i. fully human-written;
ii. human-written, then machine-polished;
iii. machine-written, then machine-humanized (obfuscated);
iv. human-initiated, then machine-continued;
v. deeply mixed text, where some parts are written by a human and some are generated by a
machine;
vi. machine-written, then human-edited.</p>
      <sec id="sec-4-1">
        <title>4.1. Dataset</title>
        <p>The training and validation sets for Subtask 2 were constructed from existing datasets focused on
fine-grained machine-generated text detection. We provided 288,918 examples for training and 72,661
for validation. For testing, we collected four new datasets, with an emphasis on student essays and
peer reviews. We additionally incorporated five recently released datasets to comprehensively evaluate
the generalization of detection systems across unseen generators and domains. The test set consists of
140,756 instances. The dataset statistics, including the distribution across six categories, are shown in
Table 6. All subsets underwent the cleaning procedure: we removed duplicates and filtered out texts
shorter than 30 characters. Below, we describe each component dataset in detail, with an overview
provided in Table 7.</p>
        <sec id="sec-4-1-1">
          <title>4.1.1. Training and Development Sets</title>
          <p>MixSet [53] includes machine-polished human-written texts and human-edited machine-generated
texts. We sampled 3,491 out of 3,600 texts, ensuring a minimum text length of 30 characters.
LLM-DetectAIve [54] is a large-scale benchmark designed to improve machine-generated text
detection across different domains and text variations. The dataset is based on M4GT [55], which
includes a mix of human-written and machine-generated texts from sources such as arXiv, WikiHow,
Wikipedia, Reddit, student essays (OUTFOX), and peer reviews (PeerRead). LLM-DetectAIve extends
the M4GT dataset by adding new machine-generated texts from more advanced models (e.g., GPT-4o),
machine-humanized MGTs, and human-written texts that were polished using LLMs. The resulting
dataset comprises 91,358 fully machine-generated texts, 103,852 machine-humanized texts, and 107,900
human-polished texts. LLM-DetectAIve includes outputs from a diverse set of LLMs, including
Llama-3-8B/70B, Mixtral-8x7B, Gemma-7B, Gemma-2-9B, GPT-4o, Gemini-1.5-Pro, and Mistral-7B. For Subtask
2, we excluded the fully machine-generated texts and used 286,169 examples from the remaining two
categories.</p>
          <p>RoFT [56] is a collection of over 21k human annotations paired with error classifications to investigate
how various variables such as model size, decoding strategy, and fine-tuning affect human detection
performance. The dataset is in English and contains domains like recipes, presidential speeches, short
stories, and New York Times. Each instance is initiated by a human and then continued by either
a human or an LLM including GPT-2 [68], GPT-3 [69] and CTRL [70]. We preprocess the data by
performing basic cleaning, removing duplicates and sanity entries. This results in a total of 9,148
instances that are either fully human-written or human-initiated and machine-continued.
RoFT-ChatGPT [57] is an augmented version of the Real or Fake Text (RoFT) dataset. The augmented
version includes GPT-3.5-turbo generations. The dataset is entirely in English and covers domains such
as short stories, recipes, New York Times news articles, and presidential speeches. It consists of 6,940
samples of text that are human-initiated and machine-continued, offering a rich resource for evaluating
the interaction between human authorship and AI-generated text.
Subtask 2 Component Datasets used for training, development and test sets. Size refers to the number of
examples we sampled from a given dataset, rather than the full set of original dataset. In addition to leveraging
existing datasets, we collected three datasets based on ICNALE, NLPeer and Peersum, and we initiated a dataset
MBZUAI-CLEF based on our real-life usage.</p>
          <p>Training and Development Sets
Dataset
Co-author [58] detected sentence-level boundaries of human-AI collaborative mixed texts. They
identified challenges in detecting AI-generated sentences in mixed texts, such as human writers’
selecting and even editing AI-generated sentences based on personal preferences; the frequent change of
authorship between neighboring sentences within the mixed text; and the short length of text segments
within mixed texts, which provides only limited stylistic cues for reliable authorship determination.</p>
          <p>Coauthor comprises 1,447 writing sessions produced through the collaboration of human writers,
recruited from Amazon Mechanical Turk, and a GPT-3 assistant [69]. These sessions encompass both
creative and argumentative writing. Writers were provided with a prompt as a starting point and
could either compose independently or request sentence suggestions from the assistant. The suggested
sentences could be accepted or rejected, and this iterative process continued until the essay was
completed. This setup results in a highly mixed text, where some parts are authored by humans and
others are generated by the machine.</p>
          <p>TriBERT [59] is a dataset of human-AI collaborative writing. First, the authors collected the
human-written essays from U.S. junior high school students (grades 7-10). They then designed 8 text modification
tasks, each involving the removal of specific segments from the original human-written texts, followed
by LLM-generated fill-in. This approach resulted in 8 distinct forms of human-AI hybrid writing. In our
six-label space, texts that start with a human-written introduction and then were completed by LLMs
are mapped as human initiated, machine continued, and all other cases are classified as deeply mixed.
This transformation doubles the number of texts, resulting in 34,272 texts, from which we sampled 21,178.
LAMP [60] consists of 1,057 paragraphs generated by LLMs and subsequently edited by professional
writers. The original paragraphs were sourced from reputable publication venues, including The New
Yorker, The New York Times, and Dear Sugar. These paragraphs cover various domains, such as fiction,
food writing, and internet advice. In total, approximately 1,200 paragraphs were selected, with the
Literary Fiction genre accounting for 80% and the remaining 20% categorized as Creative Non-Fiction.</p>
          <p>The instructions for generating these paragraphs were created using instruction back-translation
[71]. In this process, GPT-4o was prompted to summarize each paragraph into writing instructions.
After manual verification, ill-formed or overly specific instructions were filtered out, resulting in a total
of 1,057 high-quality instructions. These instructions were then used to generate additional writings
using GPT-4o, Claude-3.5-Sonnet, and Llama 3.1-70b. Finally, the LLM-generated responses were further
refined and edited by a team of 18 professional writers, following a comprehensive edit taxonomy
informed by expert writing practices.
4.1.2. Test Set
Beemo [61] Benchmark of Expert-edited Machine-generated Outputs (Beemo) is a dataset designed to
support fine-grained detection of machine-generated text (MGT), particularly in multi-author scenarios
where LLM outputs are refined by either human experts or other LLMs. It comprises a total of 19.6k
English texts spanning five tasks: open-ended generation, rewriting, summarization, and open/closed
QA. We used 17,331 examples, removing the purely machine-generated texts. Specifically, this includes
2,184 human-written texts, 2,183 machine-written, then human-edited texts, with the rest as
machine-written, then machine-humanized.</p>
          <p>
            AI-Polished-Text Evaluation (APT-Eval) Based on the 300 purely human-written texts sampled
from MixSet [53], Saha and Feizi [62] collected 11.7K machine-polished data using four different models:
GPT-4o, Llama3.1-70B, Llama3-8B, and Llama2-7B with two polishing strategies. For degree-based
polishing, the LLM is prompted to refine the text in four varying degrees of modification (
            <xref ref-type="bibr" rid="ref1">1</xref>
            )
extremely minor, (
            <xref ref-type="bibr" rid="ref2">2</xref>
            ) minor, (
            <xref ref-type="bibr" rid="ref3">3</xref>
            ) slightly-major, and (
            <xref ref-type="bibr" rid="ref4">4</xref>
            ) major. For percentage-based polishing, the LLM is instructed
to modify a fixed percentage (p%) of words in a given text. The percentage is systematically varied
across the following values: p% = 1, 5, 10, 20, 35, 50, 75%. They find that detectors have a higher
misclassification rate over smaller and older generators.
          </p>
          <p>HART [63] is a dataset for fine-grained machine-generated text detection, including categories
of purely human-written, human-written and then machine-polished, machine-generated and then
machine-humanized, purely machine-generated text. There are 16K English examples, and 4K examples
for the other four languages including Chinese, Arabic, French, and Spanish. Six LLM generators were
involved in data collection, i.e., GPT-3.5-turbo, GPT-4o, Claude-3.5-Sonnet, Gemini-1.5-pro,
Llama-3.370B-instruct, and Qwen-2.5-72B-instruct. They additionally gathered 250 machine-generated
human-edited texts. This dataset can be used as part of our test set.</p>
          <p>LLMDetect [64] is designed to distinguish between four categories of text origin: Human-Author
(fully human-written), LLM-Creator (entirely machine-generated), LLM-Polisher (human-written text
subsequently refined by an LLM), and LLM-Extender (human-written text extended or expanded by an
LLM). LLMDetect includes the Hybrid News Detection Corpus (HNDC) for training detectors, as well
as DetectEval, a comprehensive evaluation suite that considers five distinct cross-context variations
and two multi-intensity variations within the same LLM role. This allows for a thorough assessment of
detectors’ generalization and robustness across diverse contexts.
academic paper/abstract writing, paper summary, peer review, rebuttal,
Email writing, administrative letters, recommendation requests, ethical
response letters, commitment letters
PhD SOP review, teaching statements, job interview preparation, oral
interview explanations, volunteer applications
Story writing, daily chatting, continued writing
GitHub issues, README files, report writing, NLP slides summary</p>
          <p>
            Prompt refining
ALTA 2024 Shared Task Mollá et al. [65] employ four distinct construction patterns to organize
human and machine-generated sentences: (
            <xref ref-type="bibr" rid="ref1">1</xref>
            ) human-written sentences followed by machine-generated
sentences; (
            <xref ref-type="bibr" rid="ref2">2</xref>
            ) machine-generated sentences followed by human-written sentences; (
            <xref ref-type="bibr" rid="ref3">3</xref>
            ) human-written
sentences, then machine-generated sentences, followed again by human-written sentences; and (
            <xref ref-type="bibr" rid="ref4">4</xref>
            )
machine-generated sentences, then human-written sentences, followed by machine-generated sentences.
          </p>
        </sec>
        <sec id="sec-4-1-2">
          <title>4.1.3. Our Datasets</title>
          <p>We collected three datasets based on ICNALE3, Peersum, and NLPeer, and we further gathered a dataset
from scratch based on authors’ daily usage of LLMs.</p>
        </sec>
        <sec id="sec-4-1-3">
          <title>International Corpus Network of Asian Learners of English (ICNALE) is an international learner</title>
          <p>corpus. We sampled 5,843 topic-controlled essays produced by more than 5,500 college students (incl.
grad students) in ten countries/regions in Asia (China, Hong Kong, Taiwan, Indonesia, Japan, Korea,
Pakistan, Philippines, Singapore, Malaysia, and Thailand), as well as English native speakers. Based
on these human-written essays, we further generated 2,624 human-written, then machine-polished,
1,000 machine-written, then machine-humanized, and 656 human-initiated, then machine-continued
text, using the latest SOTA LLMs, including GPT-4.1, Qwen3-30B-a3b, Llama-3.3-70B, Deepseek-chat-v3,
Gemini-2.5-pro-preview, resulting in a total of 10,123 examples.</p>
          <p>
            Peersum [66] is a dataset designed for generating meta-reviews of scientific papers based on reviews.
The meta-reviews can be interpreted as abstractive summaries of reviews, multi-turn discussions
among reviewers and the paper author, and the paper abstract. We sampled 1,000 human-written
metareviews and the corresponding reviews. We used GPT-4.1-2025-04-14, GPT-4.1-mini-2025-04-16, and
Deepseek-R1 to further produce 2,000 human-written, then machine-polished, 2,000 human-initiated,
then machine-continued, and 270 machine-written, then machine-humanized cases, for a total of 5,270.
NLPeer [67] is intended for the study of peer review and approaches to NLP-based assistance to peer
review. Based on human-written reviews, we used GPT-4.1 and GPT-4.1-mini for human-written, then
machine-polished (
            <xref ref-type="bibr" rid="ref2">2,993</xref>
            ). Based on either paper abstracts or the full papers, we first applied GPT-4o to
generate reviews, and used GPT-4.1 and GPT-4.1-mini to humanize (
            <xref ref-type="bibr" rid="ref3">3,000</xref>
            ).
          </p>
          <p>MBZUAI-CLEF is a dataset we collected from scratch. The human-written texts cover six categories
including academic writing, professional correspondence, application materials, creative writing,
technical and support writing, and prompt engineering, with 378 examples in total, as shown in Table 8.
These human-written drafts were polished using GPT-4o, after which the authors further edited the
polished outputs to adapt them to real-world applications.</p>
        </sec>
      </sec>
      <sec id="sec-4-2">
        <title>4.2. Task Organization</title>
        <p>Subtask 2 was conducted in two phases:
Development Phase. Only training and development data were provided to participants, with gold
labels available for the development set. Although they were not allowed to use external training and
validation data, data augmentation strategies such as back-translation, synonym replacement, random
word deletion, and replacement were allowed.</p>
        <p>Participants competed against each other to achieve the best performance on the development set.
A live leaderboard on CodaLab was made available to track all submissions.4 Teams could make an
unlimited number of submissions, and the best score for each team, regardless of the submission time,
was displayed in real time on CodaLab.</p>
        <p>Test Phase. Participants were given approximately one week to prepare their predictions. Then,
participant teams submitted their results to CodaLab.5 They could submit multiple runs, but they would
not receive feedback on their performance. Only the latest submission from each team was considered
“official,” and was used for the final team ranking. In total, 22 teams submitted results, of which 16
submitted system description papers. After the competition concluded, we released the gold labels for
the test set. Furthermore, we kept the submission system open for the test dataset for post-shared task
evaluations and to monitor the state of the art.</p>
      </sec>
      <sec id="sec-4-3">
        <title>4.3. Evaluation</title>
        <p>
          Predictions of all systems were submitted and evaluated in CodaLab. At the test time, participants
assigned the predicted label among [0, 1, 2, 3, 4, 5] for each text, indicating its category. Participants
in the leaderboard were ranked by macro-recall. Macro-recall is selected as the primary evaluation
metric for two reasons: (
          <xref ref-type="bibr" rid="ref1">1</xref>
          ) it gives equal importance to each class, preventing performance for majority
classes from dominating the overall score on an unbalanced test set; and (
          <xref ref-type="bibr" rid="ref2">2</xref>
          ) macro-recall provides a
more focused view on the model’s ability to capture all positive instances for every class, compared
with macro-F1 balancing precision and recall for each class. As additional evaluation metrics, we also
computed accuracy and macro-F1.
        </p>
      </sec>
      <sec id="sec-4-4">
        <title>4.4. Baseline</title>
        <p>To establish a baseline, we fine-tuned a pre-trained transformer-based model RoBERTa on the training
set. Fine-tuning was performed using the Hugging Face Trainer API with the following configuration:
learning rate of 2 × 10−5, batch size of 16 for both training and evaluation, weight decay of 0.1, and a
total of 3 training epochs. Checkpoints were evaluated at the end of each epoch, and the best-performing
model on the development set was retained for subsequent testing. The baseline achieved a macro-recall
of 68.67 % on the development set, with corresponding macro-F1 and accuracy scores of 61.26 % and
56.71 %, respectively.
4.5. Submissions
22 teams submitted their predictions to CodaLab, of which 16 submitted notebook papers [24, 26, 30, 29,
27, 31, 72, 76, 77, 74, 79, 81, 78, 75, 73, 80]. The performance of 14 teams is above the baseline, and 8
teams are below fine-tuned RoBERTa-base, as shown in Table 9. One team submitted their prediction
file after the deadline, which we marked in gray. Additionally, four teams submitted files with IDs
misaligned with the test set. The test set contains IDs ranging from 0 to 141,410, with 655 entries filtered
out, rather than forming a continuous range from 0 to 140,755. For fairness, we ranked all submissions
based on their original versions. However, we also corrected the misaligned IDs and re-evaluated the
4https://codalab.lisn.upsaclay.fr/competitions/22620
5https://codalab.lisn.upsaclay.fr/competitions/22934</p>
        <p>Team
QLoRa PEFT fine-tuned Qwen3-4B-Base.</p>
        <p>Under-sample high-frequency classes and adopt data
augmentation for underrepresented classes, along with
R-Drop regularization for DeBERTa-v3-base fine-tuning.
–
Shared Transformer Encoder between several classification
heads trained to distinguish the domains.</p>
        <p>Combine the deep language understanding of
DeBERTa-v3-large and the high-dimensional mapping ability
of StarBlock2d.</p>
        <p>DeBERTa enhanced by contextual and geometric attention
Use DeBERTa-v3-Large
Fine-tune Gemma-2 2B for sequence classification with
multiple classification heads.
–
Fine-tune DeBERTa-V3-Large and combining multi-scale
features.
–
Combine the contextual strength of BERT with the sequence
modeling capabilities of Transformer layers.</p>
        <p>Fine-tune DeBERTa-V3-Large and combine it with BiLSTM
and attention mechanism.</p>
        <p>Soft and Hard Mixture of Experts (MoE) architectures with
DeBERTa-V3-Large
–
Cumulative sum of token-Level correlation signals
–
Fine-tune RoBERTa
Fine-tune RoBERTa with class balancing, data augmentation,
and calculation of specific weights for each unbalanced class.
–
Combine features at different layers extracted using
Transformers with layer-wise projection and attentive pooling.</p>
        <p>Fine-tune transformer models with data augmentation
strategies on underrepresented classes (late submission).
–
–
–
–
–
F1
affected submissions to better reflect the actual performance of their detection systems. The following
analysis is based on these corrected results.</p>
        <p>Many teams fine-tuned DeBERTa-v3-large and achieved better results than RoBERTa. Larger language
models such as Qwen-3 4B and Gemma-2 2B were superior to DeBERTa and RoBERTa, see more in
Table 10. The performance drop observed on the test set compared to the development set highlights
the need for further improvement in fine-grained human-AI collaborative text detection.</p>
        <p>Small PLM</p>
        <p>Large PLM</p>
        <p>
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
✓
Fine-tuning
✓
Data Aug.</p>
        <p>
✓
✓
✓
✓
✓
Features Comb.
✓
Ensemble
✓
✓
Mult. Heads
Qwen3-4B (QLoRA PEFT)
DeBERTa-v3-base (R-Drop)
DeBERTa-base (Multi-task)
DeBERTa-V3-Large (Hybrid)
DeBERTa-V3-Large (Geom. Attn)
DeBERTa-V3-Large
Gemma-2 2B (Multi-head)
DeBERTa-V3-Large (Multi-scale)
BERT_T
DeBERTa-V3-Large + BiLSTM
DeBERTa-V3-Large (MoE)
Token-level correlation signals
RoBERTa-base (Class weight)
RoBERTa-base (Layer proj.)
RoBERTa-large
BERT-base (Sent. segment)</p>
        <sec id="sec-4-4-1">
          <title>4.5.1. System Overview</title>
          <p>The highest-ranking system, mdok [24], fine-tuned the Qwen3-4B-Base model using QLoRA for efficient
parameter adaptation. In comparison, Bohan Li [72] used DeBERTa-v3-base enhanced with data
augmentation, balancing, and R-Drop regularization, showing strong performance despite a smaller
backbone.</p>
          <p>Several teams built upon DeBERTa-V3-Large for human-AI collaborative writing detection.
StarBERT [73], Atu [74], TaoLi [75], and Zhangzhiliang [79] all fine-tuned this model, each introducing
unique extensions: StarBERT proposed a hybrid classification framework, Atu incorporated contextual
and geometric attention, TaoLi focused on fine-grained categorization, and Zhangzhiliang added a
BiLSTM layer with attention mechanisms. Advacheck [27] also used DeBERTa-base within a multi-task
learning framework to distinguish between human, machine, and hybrid authorship styles.</p>
          <p>DetectTeam [77] enhanced DeBERTa-V3-Large with a Feature Pyramid Network to capture
multi-scale features, while CNLP-NITS-PP [31] employed Mixture-of-Experts (MoE) architectures with
SoftMoE and HardMoE for dynamic model routing.</p>
          <p>Other teams opted for RoBERTa-based architectures. VerbaNex [80] fine-tuned RoBERTa-base and
addressed data imbalance using augmentation, oversampling, undersampling, and loss weighting.
Nexus [29] fine-tuned RoBERTa-Large, applying targeted data augmentation techniques including
back-translation, synonym/antonym substitution, and random deletion.</p>
          <p>ReText.Ai [76] adopted a multi-head classification strategy over the Gemma-2 2B model, while
WeiDongWu [78] introduced BertT, a hybrid model combining BERT-base with an extra Transformer
encoder and dropout layers for six-class collaborative classification.</p>
          <p>Unibuc-NLP [30] merged transformer layers using projection and attentive pooling. Steely [26]
converted text into interpretable word-level correlation signals. Fuchuan [81] applied a BERT-based
model for sentence-level classification on segmented inputs.</p>
          <p>0
1</p>
          <p>2 3
Predicted label</p>
          <p>4</p>
          <p>Confusion Matrix of lza
le2 678 6158 16330 75
b
a
lloG3 684 4544 5941 10926 625
d</p>
          <p>336 1657
5 209 1090 1007 10
0
1</p>
          <p>2 3
Predicted label
93
4
61
5
82
148
5
0 25954 6135 1223 74
387 736
0 25840 6124 1371 73
386 715
1 2966 33975 5087 31
4 100 279 514 191 11411 5</p>
          <p>4 100 278 518 191 11408 5
le2 599 5965 17217 125 275 1053
b
a
llodG3 417 1827 3014 16761 691 92
5000
30000
25000
20000
15000
10000
5000
25000
20000
15000
10000
5000
30000
25000
20000
15000
10000
5000</p>
        </sec>
        <sec id="sec-4-4-2">
          <title>4.5.2. Recall Across Labels</title>
          <p>We show the macro-averaged recall and label-wise recall scores of 22 submissions for Subtask 2 in
Table 11. A clear trend emerges in terms of label difficulty. Label 4 (deeply-mixed text) consistently
receives the highest recall across systems, indicating it is the easiest class to detect. This is followed
by label 1 (human-written, then machine-polished) and label 0 (purely human-written), which also
achieve high recall scores for most systems. In contrast, label 5 (machine-written, then human-edited)
proves to be the most challenging, with most systems performing poorly on this category, and only a
few submissions (e.g., team 7) achieving a recall above 9.0 %.</p>
          <p>This trend partially aligns with the label distribution in the training data. The easiest labels 1
(human-written, then machine-polished) and 0 (purely human-written) are among those with the highest number
of training examples. In contrast, labels 5 (machine-written, then human-edited) and 3 (human-initiated,
then machine-continued) are the most challenging to detect, which also corresponds to their small
training sizes. While label 2 (machine-written, then machine-humanized) has a comparable amount of
training data to label 1 (91k vs. 95k), its performance is significantly worse across most systems.</p>
          <p>The confusion matrices in Figure 7 further confirm this pattern. For all top-performing systems
(Teams 1–3 and lza), label 2 is frequently misclassified as 1. This suggests that machine-humanized
texts (label 2) share stylistic cues with machine-polished texts (label 1) likely due to the dominating
influence of machine-generated text in both. Conversely, label 5 exhibits low recall and high confusion
with label 2, indicating that human edits often obscure the origin of machine-generated text, making it
particularly difficult to detect.</p>
          <p>Taken together, these findings underscore the nuanced difficulty of distinguishing between varying
degrees of human-AI collaboration. While abundant data helps, the boundary between polished and
humanized content remains inherently fuzzy, calling for more fine-grained modeling approaches beyond
token-level patterns or surface style cues.</p>
        </sec>
        <sec id="sec-4-4-3">
          <title>Beemo APT-Eval HART LLMDetect ALTA ICNALE Peersum NLPeer MBZUAI-CLEF Rank</title>
        </sec>
        <sec id="sec-4-4-4">
          <title>4.5.3. Accuracy Across Datasets</title>
          <p>The system performance across the nine component datasets in the testbed is shown in Table 12. The
top-ranked systems achieve the highest accuracies on datasets such as NLPeer (peer review), ALTA
(essay), and ICNALE (essay), which consist primarily of academic and well-structured text genres.
These datasets likely provide clearer linguistic signals and stylistic features for distinguishing different
human-AI collaborative texts. For instance, the best-performing system achieves 90.97 % accuracy on
ALTA and 96.90 % on NLPeer. These results suggest that model generalization is strongest in formal
writing settings, particularly in educational and peer-review contexts.</p>
          <p>In contrast, performance drops significantly on datasets such as MBZUAI-CLEF, Peersum, and Beemo,
which contain more diverse genres and informal domains. MBZUAI-CLEF, for example, covers a wide
range of real-world writing scenarios, including rebuttals, admin letters, emails, oral presentations, and
GitHub README files, but consists of only 1,115 examples. All systems perform poorly on this dataset,
with accuracies below 48 %, highlighting the difficulty of detecting fine-grained signals in low-resource,
noisy, or mixed-genre settings. Similarly, Peersum and Beemo include peer review and open-domain
generation tasks, both of which challenge the systems due to variability in style, structure, and intent.</p>
          <p>We also observe that datasets with larger size and consistent formatting, such as LLMDetect (48k
examples), HART (24k), and ALTA (16.5k), tend to support more stable performance across submissions.
This indicates that both data volume and stylistic consistency facilitate more effective fine-tuning
and generalization. The wide variance in performance across component datasets underscores the
importance of robustness and adaptability in collaborative text detection. Future work should prioritize
developing detection systems capable of generalizing across low-resource, diverse, and informal domains,
where style-based cues may be weaker or inconsistent.</p>
        </sec>
      </sec>
    </sec>
    <sec id="sec-5">
      <title>5. Conclusion</title>
      <p>The PAN Subtask 1 received submissions from 24 teams, of which 21 also submitted a work notebook.
Most of the submitted systems were quite strong, with the best system being almost perfect on the PAN
test data. However, unlike in 2024, the texts submitted by ELOQUENT participants posed a difficult
challenge for PAN systems. In part, this can be explained by the topic shift between the ELOQUENT
dataset and the training data provided by PAN, but (almost) all systems also struggled with certain
obfuscations applied to the texts, including ones that relied solely on prompting techniques without
changing the actual topic.</p>
      <p>The PAN Subtask 2 introduced a nuanced, six-category classification of human-AI collaboration,
reflecting the complex reality of modern text production and arising ethical and intellectual
accountability challenges. While participants’ systems could distinguish certain categories like “deeply-mixed text”
with high recall, they struggled significantly to identify “machine-written, then human-edited” texts—a
crucial and challenging real-world scenario. Performance was heavily influenced by the distribution of
training data and the domain of the text, with systems performing better on structured academic writing
than on more diverse, informal datasets. The difficulty of distinguishing between machine-polished and
machine-humanized text further underscores the fuzzy boundaries in collaborative writing. Together,
these findings indicate that while progress has been made, the reliable detection of AI authorship,
especially in its more subtle and varied forms, remains an open and pressing challenge.</p>
    </sec>
    <sec id="sec-6">
      <title>Acknowledgments</title>
      <p>The work of Janek Bevendorf, Matti Wiegmann, Maik Fröbe, Martin Potthast, and Benno Stein on
PAN subtask 1 has been funded as part of the OpenWebSearch project by the European Commission
(OpenWebSearch.eu, GA 101070014).</p>
    </sec>
    <sec id="sec-7">
      <title>Declaration on Generative AI</title>
      <p>Text, datasets, experiments, and analyses in this paper were created and conducted by the authors
themselves. Generative AI was used by some authors for assistance in the writing process, but no
substantial parts were generated by it.
G. Puccetti, T. Arnold, SemEval-2024 Task 8: Multidomain, Multimodel and Multilingual
Machine-Generated Text Detection, in: Proceedings of the 18th International Workshop on Semantic
Evaluation (SemEval-2024), 2024, pp. 2057–2079.
[9] Y. Wang, A. Shelmanov, J. Mansurov, A. Tsvigun, V. Mikhailov, R. Xing, Z. Xie, J. Geng, G. Puccetti,
E. Artemova, J. Su, M. N. Ta, M. Abassy, K. A. Elozeiri, S. E. D. A. El Etter, M. Goloburda, T. Mahmoud,
R. V. Tomar, N. Laiyk, O. M. Afzal, R. Koike, M. Kaneko, A. F. Aji, N. Habash, I. Gurevych, P. Nakov,
GenAI Content Detection Task 1: English and Multilingual Machine-Generated Text Detection: AI
vs. Human, in: F. Alam, P. Nakov, N. Habash, I. Gurevych, S. Chowdhury, A. Shelmanov, Y. Wang,
E. Artemova, M. Kutlu, G. Mikros (Eds.), Proceedings of the 1st Workshop on GenAI Content
Detection (GenAIDetect), International Conference on Computational Linguistics, Abu Dhabi,
UAE, 2025, pp. 244–261. URL: https://aclanthology.org/2025.genaidetect-1.27/.
[10] S. Saha, R. Das, D. Das, JUNLP_SS at ELOQUENT Lab 2025: Humanizing AI - Enhancing the
Realism of Machine Generated Text, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), 26th
Working Notes of the Conference and Labs of the Evaluation Forum, CLEF 2025, CEUR-WS, 2025.
[11] A. Creo, M. Hormazábal-Lagos, H. Cerezo-Costas, P. Alonso-Doval, HumanAIzers in
Voight-Kampf at ELOQUENT 2025, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), 26th Working
Notes of the Conference and Labs of the Evaluation Forum, CLEF 2025, CEUR-WS, 2025.
[12] P. Vachharajani, Literal Re-translation as a Method for AI Text Disguise and Detection Evasion,
in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), 26th Working Notes of the Conference and Labs
of the Evaluation Forum, CLEF 2025, CEUR-WS, 2025.
[13] R. R. Gunti, The Data-Centric Approach for the Voight Kampf Task, in: G. Faggioli, N. Ferro,
P. Rosso, D. Spina (Eds.), 26th Working Notes of the Conference and Labs of the Evaluation Forum,
CLEF 2025, CEUR-WS, 2025.
[14] M. Hoveyda, Bypassing Human/Machine Classifiers by Prompting LLMs for Naturally Imperfect
Text, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), 26th Working Notes of the Conference and
Labs of the Evaluation Forum, CLEF 2025, CEUR-WS, 2025.
[15] A. A. Ayele, N. Babakov, J. Bevendorf, X. B. Casals, B. Chulvi, D. Dementieva, A. Elnagar, D. Freitag,
M. Fröbe, D. Korenčić, M. Mayerl, D. Moskovskiy, A. Mukherjee, A. Panchenko, M. Potthast,
F. Rangel, N. Rizwan, P. Rosso, F. Schneider, A. Smirnova, E. Stamatatos, E. Stakovskii, B. Stein,
M. Taulé, D. Ustalov, X. Wang, M. Wiegmann, S. M. Yimam, E. Zangerle, Overview of PAN 2024:
Multi-Author Writing Style Analysis, Multilingual Text Detoxification, Oppositional Thinking
Analysis, and Generative AI Authorship Verification, in: Experimental IR Meets Multilinguality,
Multimodality, and Interaction. 15th International Conference of the CLEF Association (CLEF
2024), volume 14959 of Lecture Notes in Computer Science, Springer, Berlin Heidelberg New York,
2024, pp. 231–259. doi:10.1007/978-3-031-71908-0\_11.
[16] J. Bevendorf, D. Dementieva, M. Fröbe, B. Gipp, A. Greiner-Petter, J. Karlgren, M. Mayerl, P. Nakov,
A. Panchenko, M. Potthast, A. Shelmanov, E. Stamatatos, B. Stein, Y. Wang, M. Wiegmann,
E. Zangerle, Overview of PAN 2025: Voight-Kampf Generative AI Detection, Multilingual Text
Detoxification, Multi-Author Writing Style Analysis, and Generative Plagiarism Detection, in:
J. C. de Albornoz, J. Gonzalo, L. Plaza, A. G. S. de Herrera, J. Mothe, F. Piroi, P. Rosso, D. Spina,
G. Faggioli, N. Ferro (Eds.), Experimental IR Meets Multilinguality, Multimodality, and Interaction.
Proceedings of the Sixteenth International Conference of the CLEF Association (CLEF 2025),
Lecture Notes in Computer Science, Springer, Berlin Heidelberg New York, 2025.
[17] J. Karlgren, E. Artemova, O. Bojar, V. Mikhailov, M. Sahlgren, E. Velldal, L. Øvrelid, ELOQUENT
CLEF shared tasks for evaluation of generative language model quality, 2025 edition, in: Advances
in Information Retrieval: 47th European Conference on Information Retrieval, ECIR 2025, Lucca,
Italy, April 6–10, 2025, Proceedings, Part V, Springer-Verlag, Berlin, Heidelberg, 2025, pp. 366–372.</p>
      <p>URL: https://doi.org/10.1007/978-3-031-88720-8_56. doi:10.1007/978-3-031-88720-8_56.
[18] M. Brennan, S. Afroz, R. Greenstadt, Adversarial stylometry: Circumventing authorship recognition
to preserve privacy and anonymity, ACM Transactions on Information and System Security 15
(2012). URL: http://dx.doi.org/10.1145/2382448.2382450. doi:10.1145/2382448.2382450.
[19] H. Wang, P. Juola, A. Riddell, Reproduction and replication of an adversarial stylometry experiment,
arXiv [cs.CL] (2022). URL: http://arxiv.org/abs/2208.07395. arXiv:2208.07395.
[20] K. Krishna, Y. Song, M. Karpinska, J. Wieting, M. Iyyer, Paraphrasing evades detectors of
AI-generated text, but retrieval is an effective defense, in: A. Oh, T. Naumann, A. Globerson, K. Saenko,
M. Hardt, S. Levine (Eds.), Advances in Neural Information Processing Systems 36 (NeurIPS 2023),
volume 36, 2023, pp. 27469–27500. URL: https://proceedings.neurips.cc/paper_files/paper/2023/
hash/575c450013d0e99e4b0ecf82bd1afaa4-Abstract-Conference.html.
[21] A. Hans, A. Schwarzschild, V. Cherepanova, H. Kazemi, A. Saha, M. Goldblum, J. Geiping,
T. Goldstein, Spotting LLMs with Binoculars: Zero-shot detection of machine-generated
text, in: Forty-first International Conference on Machine Learning, ICML 2024, Vienna,
Austria, July 21-27, 2024, volume abs/2401.12070, OpenReview.net, 2024, pp. 17519–17537. URL:
https://dl.acm.org/doi/10.5555/3692070.3692768. doi:10.48550/arXiv.2401.12070.
[22] D. Sculley, C. E. Brodley, Compression and machine learning: A new perspective on feature
space vectors, in: Data Compression Conference (DCC’06), IEEE, 2006, pp. 332–341. URL:
https://ieeexplore.ieee.org/abstract/document/1607268?casa_token=EwORidpcgTwAAAAA:
zONgvLu6aVgw-jrz0A-5JHXs-SdAljLqNXabhAQh6w685CRwAXXe7FxcD67SDkf6Ztfaj6AEwWA.
doi:10.1109/dcc.2006.13.
[23] O. Halvani, C. Winter, L. Graner, On the usefulness of compression models for authorship
verification, in: Proceedings of the 12th International Conference on Availability, Reliability
and Security, volume Part F1305, ACM, New York, NY, USA, 2017, pp. 54:1–54:10. URL: http:
//dx.doi.org/10.1145/3098954.3104050. doi:10.1145/3098954.3104050.
[24] D. Macko, mdok of KInIT: Robustly Fine-tuned LLM for Binary and Multiclass AI-Generated
Text Detection, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025
Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[25] J. Liu, L. Kong, Z. Peng, F. Chen, Generative AI Authorship Verification Based on
ContrastiveEnhanced Dual-Model Decision System, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working
Notes of CLEF 2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[26] M. Seeliger, P. Styll, M. Staudinger, A. Hanbury, Human or Not? Light-Weight and Interpretable
Detection of AI-Generated Text, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes
of CLEF 2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[27] A. Voznyuk, G. Gritsai, A. Grabovoy, Team Advacheck at PAN: Multitasking Does All the Magic,
in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025 - Conference and
Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[28] J. Yang, K. Yan, Genre-Aware Contrastive Learning for AI Text Detection: A RoBERTa-Based
Approach, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025
Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[29] S. A. Zaidi, H. T. Ahmed, S. A. Akbar, Z. Shakeel, F. Alvi, A. Samad, Team Nexus Interrogators at
PAN: Voight-Kampf Generative AI Detection, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.),
Working Notes of CLEF 2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[30] T.-G. Marchitan, C. Creanga, L. P. Dinu, Unibuc - NLP at “Voight-Kampf” Generative AI Detection
PAN 2025, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025
Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[31] L. Teja, A. Yadagiri, P. Pakray, Team CNLP-NITS-PP at PAN: Advancing Generative AI Detection:
Mixture of Experts with Transformer Models, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.),
Working Notes of CLEF 2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[32] M. Jimeno-Gonzalez, E. Martínez-Cámara, P. G. Noelia Fernandez, L. A. Ureña López, Team
SINAI-INTA at PAN 2025: Uncovering machine generated text with linguistic features, in: Working
Notes of CLEF 2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[33] F. Völpel, O. Halvani, Adept: AI-Generated Text Detection Based on Phrasal Category N-Grams,
in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025 - Conference and
Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[34] J. K. Ochab, M. Matias, T. Boba, T. Walkowiak, StylOch at PAN: Gradient-boosted trees with
frequency-based stylometric features, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working
Notes of CLEF 2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[35] B. Ostrower, P. Doongare, M. T. Unnikrishnan, Binoculars, BART, and Adversaries: Multi-Faceted
AI Text Detection for PAN 2025, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes
of CLEF 2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[36] A. R. Basani, P.-Y. Chen, DivEye at PAN 2025: Diversity Boosts AI-Generated Text Detection, in:
G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025 - Conference and Labs
of the Evaluation Forum, CEUR-WS.org, 2025.
[37] S. Pudasaini, L. Miralles-Pechuán, D. Lillis, M. L. Salvador, Enhancing AI Text Detection with
Frozen Pretrained Encoders and Ensemble Learning, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina
(Eds.), Working Notes of CLEF 2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org,
2025.
[38] Y. Sun, S. Afanaseva, K. Stowe, K. Patil, Bi-directional Cross-entropy loss and Stylometric Feature
combined Classifier, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF
2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[39] J. Larson, Generative AI detection using simple Feature Selection and SVM, in: G. Faggioli,
N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025 - Conference and Labs of the
Evaluation Forum, CEUR-WS.org, 2025.
[40] J. Huang, H. Cao, X. Lin, Z. Han, Application and Analysis of Roberta-base Model Fine Tuning
Based on Data Enhancement in AI Text Detection, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina
(Eds.), Working Notes of CLEF 2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org,
2025.
[41] S. Titze, O. Halvani, LOG-AID: Logit-Based Statistical Features for AI Text Detection, in: G. Faggioli,
N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025 - Conference and Labs of the
Evaluation Forum, CEUR-WS.org, 2025.
[42] R. Kumar, A. Trivedi, O. Varshney, Voight-Kampf AI Detection Sensitivity : IIITS@CLEF’25, in:
G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025 - Conference and Labs
of the Evaluation Forum, CEUR-WS.org, 2025.
[43] Z. Liang, K. Sun, H. Cao, J. Luo, Z. Han, Research on Text Author Classification Based on
ModernBERT and Gradient Loss Function, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.),
Working Notes of CLEF 2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[44] D. Macko, R. Moro, I. Srba, Increasing the robustness of the fine-tuned multilingual
machine-generated text detectors, arXiv preprint arXiv:2503.15128 (2025).
[45] X. Hu, P.-Y. Chen, T.-Y. Ho, RADAR: Robust AI-text detection via adversarial learning, Neural
Information Processing Systems abs/2307.03838 (2023) 15077–15095. URL: https://proceedings.neurips.
cc/paper_files/paper/2023/hash/30e15e5941ae0cdab7ef58cc8d59a4ca-Abstract-Conference.html.
doi:10.48550/arXiv.2307.03838. arXiv:2307.03838.
[46] H. Guo, S. Cheng, X. Jin, Z. Zhang, K. Zhang, G. Tao, G. Shen, X. Zhang, Biscope: Ai-generated
text detection by checking memorization of preceding tokens, Advances in Neural Information
Processing Systems 37 (2024) 104065–104090.
[47] O. Halvani, Constituent Treelib - A Lightweight Python Library for Constructing, Processing,
and Visualizing Constituent Trees., 2024. URL: https://github.com/Halvani/constituent-treelib.
doi:10.5281/zenodo.10951644.
[48] A. Peñas, Á. Rodrigo, A Simple Measure to Assess Non-response, in: Proceedings of the 49th
Annual Meeting of the Association for Computational Linguistics: Human Language Technologies,
2011, pp. 1415–1424. URL: https://aclanthology.org/P11-1142.pdf.
[49] J. Bevendorf, B. Stein, M. Hagen, M. Potthast, Generalizing unmasking for short texts, in:
Proceedings of the 2019 Conference of the North, Association for Computational Linguistics,
Stroudsburg, PA, USA, 2019, pp. 654–659. URL: https://aclanthology.org/N19-1068.pdf. doi:10.
18653/v1/n19-1068.
[50] M. Kestemont, K. Luyckx, W. Daelemans, T. Crombez, Cross-Genre Authorship Verification Using
Unmasking, English Studies 93 (2012) 340–356. URL: http://dx.doi.org/10.1080/0013838X.2012.
668793. doi:10.1080/0013838X.2012.668793.
[51] Y. Wang, J. Mansurov, P. Ivanov, J. Su, A. Shelmanov, A. Tsvigun, C. Whitehouse, O. M. Afzal,
T. Mahmoud, T. Sasaki, T. Arnold, A. F. Aji, N. Habash, I. Gurevych, P. Nakov, M4: Multi-generator,
Multi-domain, and Multi-lingual Black-Box Machine-Generated Text Detection, in: Y. Graham,
M. Purver (Eds.), Proceedings of the 18th Conference of the European Chapter of the Association
for Computational Linguistics (Volume 1: Long Papers), Association for Computational Linguistics,
2024, pp. 1369–1407. URL: https://aclanthology.org/2024.eacl-long.83.pdf.
[52] J. Hutson, Human-ai collaboration in writing: A multidimensional framework for creative and
intellectual authorship, International Journal of Changes in Education (2025).
[53] Q. Zhang, C. Gao, D. Chen, Y. Huang, Y. Huang, Z. Sun, S. Zhang, W. Li, Z. Fu, Y. Wan,
L. Sun, LLM-as-a-coauthor: Can mixed human-written and machine-generated text be
detected?, in: K. Duh, H. Gomez, S. Bethard (Eds.), Findings of the Association for
Computational Linguistics: NAACL 2024, Association for Computational Linguistics, Mexico City,
Mexico, 2024, pp. 409–436. URL: https://aclanthology.org/2024.findings-naacl.29/. doi: 10.18653/v1/
2024.findings-naacl.29.
[54] M. Abassy, K. Elozeiri, A. Aziz, M. N. Ta, R. V. Tomar, B. Adhikari, S. E. D. Ahmed, Y. Wang,
O. Mohammed Afzal, Z. Xie, J. Mansurov, E. Artemova, V. Mikhailov, R. Xing, J. Geng, H. Iqbal, Z. M.
Mujahid, T. Mahmoud, A. Tsvigun, A. F. Aji, A. Shelmanov, N. Habash, I. Gurevych, P. Nakov,
LLMDetectAIve: a tool for fine-grained machine-generated text detection, in: D. I. Hernandez Farias,
T. Hope, M. Li (Eds.), Proceedings of the 2024 Conference on Empirical Methods in Natural
Language Processing: System Demonstrations, Association for Computational Linguistics, Miami,
Florida, USA, 2024, pp. 336–343. URL: https://aclanthology.org/2024.emnlp-demo.35.
[55] Y. Wang, J. Mansurov, P. Ivanov, J. Su, A. Shelmanov, A. Tsvigun, O. Mohammed Afzal, T. Mahmoud,
G. Puccetti, T. Arnold, A. Aji, N. Habash, I. Gurevych, P. Nakov, M4GT-bench: Evaluation
benchmark for black-box machine-generated text detection, in: L.-W. Ku, A. Martins, V. Srikumar
(Eds.), Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics
(Volume 1: Long Papers), Association for Computational Linguistics, Bangkok, Thailand, 2024, pp.
3964–3992. URL: https://aclanthology.org/2024.acl-long.218. doi:10.18653/v1/2024.acl-long.
218.
[56] L. Dugan, D. Ippolito, A. Kirubarajan, S. Shi, C. Callison-Burch, Real or fake text?: Investigating
human ability to detect boundaries between human-written and machine-generated text, 2022.</p>
      <p>URL: https://arxiv.org/abs/2212.12672. arXiv:2212.12672.
[57] L. Kushnareva, T. Gaintseva, G. Magai, S. Barannikov, D. Abulkhanov, K. Kuznetsov, E. Tulchinskii,
I. Piontkovskaya, S. Nikolenko, Ai-generated text boundary detection with roft, 2024. URL: https:
//arxiv.org/abs/2311.08349. arXiv:2311.08349.
[58] Z. Zeng, S. Liu, L. Sha, Z. Li, K. Yang, S. Liu, D. Gašević, G. Chen, Detecting ai-generated sentences
in human-ai collaborative hybrid texts: challenges, strategies, and insights, in: Proceedings of
the Thirty-Third International Joint Conference on Artificial Intelligence, IJCAI ’24, 2024. URL:
https://doi.org/10.24963/ijcai.2024/835. doi:10.24963/ijcai.2024/835.
[59] Z. Zeng, L. Sha, Y. Li, K. Yang, D. Gašević, G. Chen, Towards automatic boundary detection for
human-ai collaborative hybrid essay in education, 2023. URL: https://arxiv.org/abs/2307.12267.
arXiv:2307.12267.
[60] T. Chakrabarty, P. Laban, C.-S. Wu, Can ai writing be salvaged? mitigating idiosyncrasies and
improving human-ai alignment in the writing process through edits, 2025. URL: https://arxiv.org/
abs/2409.14509. arXiv:2409.14509.
[61] E. Artemova, J. S. Lucas, S. Venkatraman, J. Lee, S. Tilga, A. Uchendu, V. Mikhailov, Beemo:
Benchmark of expert-edited machine-generated outputs, in: L. Chiruzzo, A. Ritter, L. Wang (Eds.),
Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for
Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), Association
for Computational Linguistics, Albuquerque, New Mexico, 2025, pp. 6992–7018. URL: https://
aclanthology.org/2025.naacl-long.357/.
[62] S. Saha, S. Feizi, Almost ai, almost human: The challenge of detecting ai-polished writing, 2025.
URL: https://arxiv.org/abs/2502.15666. arXiv:2502.15666.
[63] G. Bao, L. Rong, Y. Zhao, Q. Zhou, Y. Zhang, Decoupling content and expression: Two-dimensional
detection of ai-generated text, arXiv preprint arXiv:2503.00258 (2025).
[64] Z. Cheng, L. Zhou, F. Jiang, B. Wang, H. Li, Beyond binary: Towards fine-grained llm-generated
text detection via role recognition and involvement measurement, arXiv preprint arXiv:2410.14259
(2024).
[65] D. Mollá, Q. Xu, Z. Zeng, Z. Li, Overview of the 2024 ALTA shared task: Detect automatic
AI-generated sentences for human-AI hybrid articles, in: T. Baldwin, S. J. Rodríguez Méndez,
N. Kuo (Eds.), Proceedings of the 22nd Annual Workshop of the Australasian Language Technology
Association, Association for Computational Linguistics, Canberra, Australia, 2024, pp. 197–202.
URL: https://aclanthology.org/2024.alta-1.17/.
[66] M. Li, E. Hovy, J. Lau, Summarizing multiple documents with conversational structure for
meta-review generation, in: H. Bouamor, J. Pino, K. Bali (Eds.), Findings of the Association for
Computational Linguistics: EMNLP 2023, Association for Computational Linguistics, Singapore,
2023, pp. 7089–7112. URL: https://aclanthology.org/2023.findings-emnlp.472/. doi:10.18653/v1/
2023.findings-emnlp.472.
[67] N. Dycke, I. Kuznetsov, I. Gurevych, NLPeer: A unified resource for the computational study of
peer review, in: A. Rogers, J. Boyd-Graber, N. Okazaki (Eds.), Proceedings of the 61st Annual
Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Association
for Computational Linguistics, Toronto, Canada, 2023, pp. 5049–5073. URL: https://aclanthology.
org/2023.acl-long.277/. doi:10.18653/v1/2023.acl-long.277.
[68] A. Radford, J. Wu, R. Child, D. Luan, D. Amodei, I. Sutskever, Language models are unsupervised
multitask learners, 2019. URL: https://api.semanticscholar.org/CorpusID:160025533.
[69] T. B. Brown, B. Mann, N. Ryder, M. Subbiah, J. Kaplan, P. Dhariwal, A. Neelakantan, P. Shyam,
G. Sastry, A. Askell, S. Agarwal, A. Herbert-Voss, G. Krueger, T. Henighan, R. Child, A. Ramesh,
D. M. Ziegler, J. Wu, C. Winter, C. Hesse, M. Chen, E. Sigler, M. Litwin, S. Gray, B. Chess, J. Clark,
C. Berner, S. McCandlish, A. Radford, I. Sutskever, D. Amodei, Language models are few-shot
learners, 2020. URL: https://arxiv.org/abs/2005.14165. arXiv:2005.14165.
[70] N. S. Keskar, B. McCann, L. R. Varshney, C. Xiong, R. Socher, CTRL: A conditional transformer
language model for controllable generation, CoRR abs/1909.05858 (2019). URL: http://arxiv.org/
abs/1909.05858. arXiv:1909.05858.
[71] X. Li, P. Yu, C. Zhou, T. Schick, O. Levy, L. Zettlemoyer, J. Weston, M. Lewis, Self-alignment with
instruction backtranslation, arXiv preprint arXiv:2308.06259 (2023).
[72] B. Li, H. Qi, K. Yan, Team Bohan Li at PAN: DeBERTa-v3 with R-Drop regularization for Human-AI
Collaborative Text Classification, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes
of CLEF 2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[73] M. Zheng, Y. Zhong, F. Liu, T. Xian, M. Xie, W. Wu, Z. Zhang, Q. Sun, StarBERT: A Hybrid Neural
Network Model for Human-AI Collaborative Text Classification, in: G. Faggioli, N. Ferro, P. Rosso,
D. Spina (Eds.), Working Notes of CLEF 2025 - Conference and Labs of the Evaluation Forum,
CEUR-WS.org, 2025.
[74] T. Xian, Y. Zhong, F. Liu, M. Xie, Q. Sun, M. Zheng, W. Wu, Z. Zhang, DBG: Human-AI Collaborative
Text Classification with DeBERTa-enhanced Contextual and Geometric Attention, in: G. Faggioli,
N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025 - Conference and Labs of the
Evaluation Forum, CEUR-WS.org, 2025.
[75] T. Li, Fine-Grained Human-AI Collaborative Text Classification Using DeBERTa, in: G. Faggioli,
N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025 - Conference and Labs of the
Evaluation Forum, CEUR-WS.org, 2025.
[76] D. Ignatenko, K. Zaitsev, O. Shkriaba, ReText.Ai Team at PAN 2025: Applying Multiple
Classification Heads to a Transformer Model for Human-AI Collaborative Text Classification, in: G. Faggioli,
N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025 - Conference and Labs of the
Evaluation Forum, CEUR-WS.org, 2025.
[77] Q. Sun, L. Ma, W. Yang, T. Xian, M. Xie, W. Wu, Z. Zhang, M. Zheng, DeBERTa-FPN: Fusion
Feature Pyramid Network for Human-AI Collaborative Text Classification, in: G. Faggioli, N. Ferro,
P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025 - Conference and Labs of the Evaluation
Forum, CEUR-WS.org, 2025.
[78] W. Wu, W. Yang, Z. Zhang, M. Xie, M. Zheng, T. Xian, Q. Sun, Bert_T for Human-AI Collaborative
Text Classification, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025
- Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[79] Z. Zhang, W. Yang, W. Wu, M. Xie, M. Zheng, Q. Sun, T. Xian, DBA: A Hybrid Neural Network
Model for Generative Human-AI Collaborative Text Classification, in: G. Faggioli, N. Ferro,
P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025 - Conference and Labs of the Evaluation
Forum, CEUR-WS.org, 2025.
[80] D. G. Sánchez, J. Jimenez, M. Ramírez, J. Martinez, RoBERT-IA: Human-AI Collaborative Text
Classification, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025 -
Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[81] Y. Fuchuan, H. Cao, H. Zhongyuan, Sentence-Level AI-Generated Text Detection with Fine-Tuned
BERT, in: G. Faggioli, N. Ferro, P. Rosso, D. Spina (Eds.), Working Notes of CLEF 2025 - Conference
and Labs of the Evaluation Forum, CEUR-WS.org, 2025.</p>
    </sec>
  </body>
  <back>
    <ref-list>
      <ref id="ref1">
        <mixed-citation>
          [1]
          <string-name>
            <given-names>J.</given-names>
            <surname>Bevendorf</surname>
          </string-name>
          ,
          <string-name>
            <given-names>I.</given-names>
            <surname>Borrego-Obrador</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Chinea-Ríos</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Franco-Salvador</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Fröbe</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Heini</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Kredens</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mayerl</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Pęzik</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Potthast</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Rangel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Rosso</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Stamatatos</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Stein</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Wiegmann</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Wolska</surname>
          </string-name>
          , E. Zangerle, Overview of PAN 2023:
          <article-title>Authorship Verification, Multi-Author Writing Style Analysis, Profiling Cryptocurrency Influencers, and Trigger Detection, in: Experimental IR Meets Multilinguality, Multimodality, and Interaction</article-title>
          .
          <source>14th International Conference of the CLEF Association (CLEF</source>
          <year>2023</year>
          ), volume
          <volume>14163</volume>
          of Lecture Notes in Computer Science, Springer, Berlin Heidelberg New York,
          <year>2023</year>
          , pp.
          <fpage>459</fpage>
          -
          <lpage>481</lpage>
          . doi:10.1007/978-3-031-42448-9_29.
        </mixed-citation>
      </ref>
      <ref id="ref2">
        <mixed-citation>
          [2]
          <string-name>
            <given-names>J.</given-names>
            <surname>Bevendorf</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Chulvi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Fersini</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Heini</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Kestemont</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Kredens</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mayerl</surname>
          </string-name>
          , R. Ortega-Bueno, P. Pęzik,
          <string-name>
            <given-names>M.</given-names>
            <surname>Potthast</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Rangel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Rosso</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Stamatatos</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Stein</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Wiegmann</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Wolska</surname>
          </string-name>
          , E. Zangerle, Overview of PAN 2022:
          <article-title>Authorship Verification, Profiling Irony and Stereotype Spreaders, and Style Change Detection, in: Experimental IR Meets Multilinguality, Multimodality, and Interaction</article-title>
          .
          <source>13th International Conference of the CLEF Association (CLEF</source>
          <year>2022</year>
          ), volume
          <volume>13186</volume>
          of Lecture Notes in Computer Science, Springer, Berlin Heidelberg New York,
          <year>2022</year>
          . doi:10.1007/978-3-031-13643-6.
        </mixed-citation>
      </ref>
      <ref id="ref3">
        <mixed-citation>
          [3]
          <string-name>
            <given-names>J.</given-names>
            <surname>Bevendorf</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Chulvi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G. L. D. L. P.</given-names>
            <surname>Sarracén</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Kestemont</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Manjavacas</surname>
          </string-name>
          , I. Markov,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mayerl</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Potthast</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Rangel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Rosso</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Stamatatos</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Stein</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Wiegmann</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Wolska</surname>
          </string-name>
          , E. Zangerle, Overview of PAN 2021:
          <article-title>Authorship Verification, Profiling Hate Speech Spreaders on Twitter,and Style Change Detection</article-title>
          ,
          <source>in: 12th International Conference of the CLEF Association (CLEF</source>
          <year>2021</year>
          ), Springer,
          <year>2021</year>
          . URL: https://doi.org/10.1007/978-3-030-85251-1_26.
        </mixed-citation>
      </ref>
      <ref id="ref4">
        <mixed-citation>
          [4]
          <string-name>
            <given-names>J.</given-names>
            <surname>Bevendorf</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Ghanem</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Giachanou</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Kestemont</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Manjavacas</surname>
          </string-name>
          , I. Markov,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mayerl</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Potthast</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Rangel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Rosso</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Specht</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Stamatatos</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Stein</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Wiegmann</surname>
          </string-name>
          , E. Zangerle, Overview of PAN 2020:
          <article-title>Authorship Verification, Celebrity Profiling, Profiling Fake News Spreaders on Twitter, and Style Change Detection, in: Experimental IR Meets Multilinguality, Multimodality, and Interaction</article-title>
          .
          <source>11th International Conference of the CLEF Initiative (CLEF</source>
          <year>2020</year>
          ), volume
          <volume>12260</volume>
          of Lecture Notes in Computer Science, Springer, Berlin Heidelberg New York,
          <year>2020</year>
          , pp.
          <fpage>372</fpage>
          -
          <lpage>383</lpage>
          . doi:10.1007/978-3-030-58219-7_25.
        </mixed-citation>
      </ref>
      <ref id="ref5">
        <mixed-citation>
          [5]
          <string-name>
            <given-names>J.</given-names>
            <surname>Bevendorf</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Wiegmann</surname>
          </string-name>
          , E. Richter,
          <string-name>
            <given-names>M.</given-names>
            <surname>Potthast</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Stein</surname>
          </string-name>
          ,
          <article-title>The Two Paradigms of LLM Detection: Authorship Attribution vs. Authorship Verification</article-title>
          , in: The 63rd Annual Meeting of the Association for Computational Linguistics (ACL
          <year>2025</year>
          )
          <article-title>(Findings)</article-title>
          ,
          <source>Association for Computational Linguistics</source>
          ,
          <year>2025</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref6">
        <mixed-citation>
          [6]
          <string-name>
            <given-names>J.</given-names>
            <surname>Bevendorf</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Wiegmann</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Karlgren</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Dürlich</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Gogoulou</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Talman</surname>
          </string-name>
          , E. Stamatatos,
          <string-name>
            <given-names>M.</given-names>
            <surname>Potthast</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Stein</surname>
          </string-name>
          ,
          <article-title>Overview of the “Voight-Kampf” Generative AI Authorship Verification Task at PAN and ELOQUENT 2024</article-title>
          , in: G. Faggioli,
          <string-name>
            <given-names>N.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Galuščáková</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>García Seco de Herrera</surname>
          </string-name>
          (Eds.),
          <source>Working Notes of CLEF 2024 - Conference and Labs of the Evaluation Forum, CEUR Workshop Proceedings, CEUR-WS.org</source>
          ,
          <year>2024</year>
          , pp.
          <fpage>2486</fpage>
          -
          <lpage>2506</lpage>
          . URL: http://ceur-ws.org/Vol-3740/paper-225.pdf.
        </mixed-citation>
      </ref>
      <ref id="ref7">
        <mixed-citation>
          [7]
          <string-name>
            <given-names>M.</given-names>
            <surname>Fröbe</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Wiegmann</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Kolyada</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Grahm</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Elstner</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Loebe</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Hagen</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Stein</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Potthast</surname>
          </string-name>
          ,
          <article-title>Continuous Integration for Reproducible Shared Tasks with TIRA.io</article-title>
          , in:
          <string-name>
            <given-names>J.</given-names>
            <surname>Kamps</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Goeuriot</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Crestani</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Maistro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Joho</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Davis</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Gurrin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>U.</given-names>
            <surname>Kruschwitz</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Caputo</surname>
          </string-name>
          (Eds.),
          <source>Advances in Information Retrieval. 45th European Conference on IR Research (ECIR</source>
          <year>2023</year>
          ), Lecture Notes in Computer Science, Springer, Berlin Heidelberg New York,
          <year>2023</year>
          , pp.
          <fpage>236</fpage>
          -
          <lpage>241</lpage>
          . URL: https://link.springer.com/chapter/10.1007/978-3-031-28241-6_20. doi:10.1007/978-3-031-28241-6_20.
        </mixed-citation>
      </ref>
      <ref id="ref8">
        <mixed-citation>
          [8]
          <string-name>
            <given-names>Y.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Mansurov</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Ivanov</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Su</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Shelmanov</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Tsvigun</surname>
          </string-name>
          ,
          <string-name>
            <given-names>O. M.</given-names>
            <surname>Afzal</surname>
          </string-name>
          , T. Mahmoud,
        </mixed-citation>
      </ref>
    </ref-list>
  </back>
</article>