<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta />
    <article-meta>
      <title-group>
        <article-title>Overview of the Multilingual Text Detoxification Task at PAN 2025</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author">
          <string-name>Daryna Dementieva</string-name>
          <email>daryna.dementieva@tum.de</email>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Vitaly Protasov</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Nikolay Babakov</string-name>
          <email>nikolay.babakov@usc.es</email>
          <xref ref-type="aff" rid="aff11">11</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Naquee Rizwan</string-name>
          <email>nrizwan@kgpian.iitkgp.ac.in</email>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Ilseyar Alimova</string-name>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Caroline Brun</string-name>
          <email>caroline.brun@naverlabs.com</email>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Vasily Konovalov</string-name>
          <email>vasily.konovalov@phystech.edu</email>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Arianna Muti</string-name>
          <email>arianna.muti@unibocconi.it</email>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Chaya Liebeskind</string-name>
          <email>liebchaya@gmail.com</email>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Marina Litvak</string-name>
          <email>marinal@sce.ac.il</email>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Debora Nozza</string-name>
          <email>debora.nozza@unibocconi.it</email>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Shehryaar Shah Khan</string-name>
          <email>shehryaarshahkhan@gmail.com</email>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Sotaro Takeshita</string-name>
          <email>sotaro.takeshita@uni-mannheim.de</email>
          <xref ref-type="aff" rid="aff10">10</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Natalia Vanetik</string-name>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Abinew Ali Ayele</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Florian Schneider</string-name>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Xintong Wang</string-name>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Seid Muhie Yimam</string-name>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Ashraf Elnagar</string-name>
          <xref ref-type="aff" rid="aff12">12</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Animesh Mukherjee</string-name>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Alexander Panchenko</string-name>
          <email>a.panchenko@skol.tech</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <aff id="aff0">
          <label>0</label>
          <institution>Artificial Intelligence Research Institute</institution>
          ,
          <addr-line>Moscow</addr-line>
          ,
          <country country="RU">Russia</country>
        </aff>
        <aff id="aff1">
          <label>1</label>
          <institution>Bahir Dar University</institution>
          ,
          <addr-line>Bahir Dar</addr-line>
          ,
          <country country="ET">Ethiopia</country>
        </aff>
        <aff id="aff2">
          <label>2</label>
          <institution>Bocconi University</institution>
          ,
          <addr-line>Milan</addr-line>
          ,
          <country country="IT">Italy</country>
        </aff>
        <aff id="aff3">
          <label>3</label>
          <institution>Indian Institute of Technology</institution>
          ,
          <addr-line>Kharagpur</addr-line>
          ,
          <country country="IN">India</country>
        </aff>
        <aff id="aff4">
          <label>4</label>
          <institution>Jerusalem College of Technology</institution>
          ,
          <addr-line>Jerusalem</addr-line>
          ,
          <country country="IL">Israel</country>
        </aff>
        <aff id="aff5">
          <label>5</label>
          <institution>NAVER Labs Europe</institution>
          ,
          <addr-line>Grenoble</addr-line>
          ,
          <country country="FR">France</country>
        </aff>
        <aff id="aff6">
          <label>6</label>
          <institution>Shamoon Academic College of Engineering</institution>
          ,
          <addr-line>Beer Sheva</addr-line>
          ,
          <country country="IL">Israel</country>
        </aff>
        <aff id="aff7">
          <label>7</label>
          <institution>Skoltech</institution>
          ,
          <addr-line>Moscow</addr-line>
          ,
          <country country="RU">Russia</country>
        </aff>
        <aff id="aff8">
          <label>8</label>
          <institution>Technical University of Munich</institution>
          ,
          <addr-line>Munich</addr-line>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff9">
          <label>9</label>
          <institution>University of Hamburg</institution>
          ,
          <addr-line>Hamburg</addr-line>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff10">
          <label>10</label>
          <institution>University of Mannheim</institution>
          ,
          <addr-line>Mannheim</addr-line>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff11">
          <label>11</label>
          <institution>University of Santiago de Compostela</institution>
          ,
          <addr-line>Santiago de Compostela</addr-line>
          ,
          <country country="ES">Spain</country>
        </aff>
        <aff id="aff12">
          <label>12</label>
          <institution>University of Sharjah</institution>
          ,
          <addr-line>Sharjah, UAE</addr-line>
        </aff>
      </contrib-group>
      <abstract>
        <p>Despite different countries and social platform regulations, digital abusive speech persists as a significant challenge. One of the ways to tackle abusive, or more specifically, toxic language can be automatic text detoxification—a text style transfer (TST) task of changing the register of a text from toxic to more non-toxic. We extend our previous Multilingual Text Detoxification (TextDetox) task to new languages—Italian, French, Hebrew, Hinglish, Japanese, and Tatar—inviting participants to take part in multilingual and cross-lingual text detoxification challenges. We provide insights into new data collection and evaluation metrics, as well as dive into the participants' results. Warning: This paper contains rude texts that only serve as illustrative examples.</p>
      </abstract>
      <kwd-group>
        <kwd>PAN 2025</kwd>
        <kwd>Multilingual Text Detoxification</kwd>
        <kwd>Text Style Transfer</kwd>
        <kwd>Multilingualism</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec id="sec-1">
      <title>1. Introduction</title>
      <p>
        While progress has been made in addressing digital violence [
        <xref ref-type="bibr" rid="ref1">1</xref>
        ] and abusive speech [
        <xref ref-type="bibr" rid="ref2">2</xref>
        ], there remains a
pressing need for more proactive approaches to moderating hate and toxic language. In our TextDetox
shared task, we continue to investigate text detoxification as a proactive method for toxic speech
moderation, now extending our coverage to 15 languages: English, Spanish, German, Chinese, Arabic,
Hindi, Ukrainian, Russian, Amharic, Italian, French, Hebrew, Hinglish, Japanese, and Tatar.
      </p>
      <p>In this shared task, we explored both setups—multilingual and cross-lingual one (Figure 1)—extending
parallel text detoxification data from TextDetox 2024 to 6 new languages [ 3]. The remainder of this
paper is structured as follows. Section 2 gives an overview of the TextDetox 2025 shared task rules.
Section 3 provides the full overview of the new multilingual parallel text detoxification dataset collection
per each language. In the following sections, the evaluation setups essentials are described—baselines
in Section 4, automatic evaluation setup in Section 5, and LLM-as-a-judge evaluation setup in Section 6.
The submissions from participants are described in Section 7. Section 8 provides the details about final
results—both automatic (Section 8.1) and LLM-as-a-judge (Section 8.2) evaluation leaderboards. Finally,
Section 9 concludes the paper.</p>
      <p>All the resources produced from the task are listed at the shared task page1 and are also mentioned
in the corresponding sections. All the data, classifiers, and text detoxification baselines are released for
a public usage at our HuggingFace space.2 Also, we provide additional information on the annotation
and detailed results at our GitHub repo for the corresponding year.3</p>
    </sec>
    <sec id="sec-2">
      <title>2. Shared Task Rules</title>
      <sec id="sec-2-1">
        <title>The shared task timeline was divided into two phases—development and test.</title>
        <p>Development Phase This year, together with already existing parallel data for English and Russian
from previous works [4, 5], we released 400 parallel samples per each TextDetox 2024 language [6] as
training data. Then, we used previous year languages 600 toxic samples as a test set as well as 100 toxic
sentences per new 6 languages.</p>
        <p>Test Phase We extended the test data for new languages to full 600 toxic samples as well. Thus, now,
for all 15 languages, equal amount of toxic test instances are available. Participants were invited to
submit multilingual and cross-lingual solutions.</p>
      </sec>
      <sec id="sec-2-2">
        <title>1https://pan.webis.de/clef25/pan25-web/text-detoxification.html 2https://hf.co/textdetox 3https://github.com/textdetox/textdetox_clef_2025</title>
        <p>Leaderboards During both phases, the leaderboards based on automatic evaluation were available.
We used Codalab platform [7].4 At each phase leaderboard, we highlighted scores per each challenge—
AvgP for the languages with parallel training data available and AvgNP for new languages without any
training data. This year, we also significantly improved the automatic evaluation metrics (Section 5).
At the same time, for additional leaderboard, we provided as well LLM-as-a-judge results with
finetuned LLMs for text detoxification evaluation task (Section 6). Participants were asked to analyze their
performance from both leaderboards.</p>
      </sec>
    </sec>
    <sec id="sec-3">
      <title>3. Multilingual Parallel Text Detoxification Dataset</title>
      <p>Firstly, we re-used the data from TextDetox 2024 shared task [6] for 9 languages—English, Spanish,
German, Chinese, Arabic, Hindi, Ukrainian, Russian, Amharic—as: (i) 400 parallel sentences per each
language now were used as a training data for all phases; (ii) 600 toxic sentences per each language
served as a part of the test sets for both dev and test phases.</p>
      <p>Then, for new languages—Italian, French, Hebrew, Hinglish, Japanese, and Tatar—we asked experts
and native speakers to contribute for new corpora collection. Further, we describe the collection
details per each language: French (Section 3.1), Italian (Section 3.2), Hebrew (Section 3.3), Japanese
(Section 3.4), Hinglish (Section 3.5), and Tatar (Section 3.6).</p>
      <p>For all the data collection, we adapt the concept of English ParaDetox [4] collection pipeline. The
quality check consists of three main criteria:
Task 1: Rewrite text in a polite way Annotators need to provide the detoxified paraphrase of the
text so it becomes non-toxic and the main content is saved or to skip paraphrasing if the text is
not possible to rewrite in non-toxic way;
Task 2: Do these sentences mean the same? Check if the content is indeed the same between the
original toxic text and its potential non-toxic paraphrase;</p>
      <sec id="sec-3-1">
        <title>Task 3: Is this text offensive?</title>
        <sec id="sec-3-1-1">
          <title>Verification of the provided paraphrase if it is indeed non-toxic.</title>
          <p>In the same manner, each language stakeholder asked the annotators to rewrite the toxic samples
verifying the main three criteria: (i) the new paraphrase should be non-toxic; (ii) the content should be
saved as much as possible; (iii) the resulted text should be fluent but may contain some minor mistakes
(as the majority of the original toxic samples are examples from posts from social networks).</p>
          <p>We explicitly communicated to language stakeholders that deletion of toxic words should be
considered only as a last resort in the detoxification process—used solely when rephrasing is not feasible.
Annotators were instructed to prioritize rephrasing toxic segments wherever possible, relying on
deletion only when no suitable neutral alternative could be constructed.</p>
          <p>For new languages, we obtained 600 parallel pairs from which toxic parts were revealed as dev (first
100) and test (full 600) sets.
3.1. French
We introduce the DetoxifyFR dataset, a novel detoxification dataset for French, comprising 600 toxic
comments and their human-written neutral rewrites, incorporated into the test phase of the shared
task.</p>
        </sec>
      </sec>
      <sec id="sec-3-2">
        <title>3.1.1. Input Data Preparation</title>
        <p>The DetoxifyFR dataset is constructed from two distinct sources:</p>
        <sec id="sec-3-2-1">
          <title>4https://codalab.lisn.upsaclay.fr/competitions/22396</title>
          <p>FrenchToxicityPrompts [8]: 50,000 naturally occurring French samples, annotated with toxicity
scores from the Perspective API. This data originates from Lélu, a French dialogue dataset extracted
from Reddit’s public French datasets. Conversations are segmented into sentences using spaCy, with
Perspective API scores ranging from 0 (not toxic) to 100 (highly toxic) assigned to each sentence. We
retain toxic and highly toxic sentences (i.e., scores ≥ 50) as candidates for detoxification, resulting in
12,601 sentences.</p>
          <p>Jigsaw Multilingual Toxic Comment Classification test set [9]: 9,274 French samples annotated
as toxic or non-toxic, from which we retain 1,557 toxic samples as candidates for detoxification.</p>
        </sec>
      </sec>
      <sec id="sec-3-3">
        <title>3.1.2. Annotation Process</title>
        <p>LLM-based Pre-filtering Producing a neutral version of a toxic sentence while preserving its
meaning is not always feasible, as some sentences are inherently toxic and can not be detoxified without
a drastic change in content. To streamline the manual annotation process, we prompted
Llama-3.170B-Instruct to evaluate whether a sample could be detoxified. Only sentences deemed suitable for
detoxification were passed to the next phase. This LLM-based filtering retained 1,062 toxic candidates
from Jigsaw and 9,103 from FrenchToxicityPrompts. Interestingly, on FrenchToxicityPrompts, we
observe that the LLM filters out approximately 50% of the highly toxic sentences (toxicity ≥ 75) but
only 22% of the toxic sentences (50 ≤ toxicity &lt; 75).</p>
        <p>Manual Annotation We then randomly selected comments, evenly split between the two filtered
datasets. The annotation process is entirely manual and does not rely on LLM-generated content. Initial
tests with LLMs for generating detoxified sentences revealed biases in the annotations, leading us to
adopt a fully manual approach. A total of 600 French samples, approximately evenly distributed from
the two data sources, were detoxified during this process.</p>
        <p>Annotator The annotator is a native French speaker with extensive experience in linguistic data
annotation and holds a PhD in computational linguistics.</p>
        <p>LLM-based Validation We applied a validation step on the final data using an LLM-based evaluation.
For this, we prompted Qwen2.5-72B-Instruct to assess the toxicity score of the 600 detoxified sentences
on a 5-point scale (from 0: not offensive to 4: toxic) and content preservation (from 0: same content to
4: different content). Results are presented in Tables 1 and 2.</p>
        <p>The LLM judge demonstrates high-quality detoxified data, with 96% of sentences rated non-offensive
(Table 1). Additionally, Table 2 shows strong content preservation, with 61.6% of sentences having only
slight tone changes and 24.3% exhibiting reduced emotional intensity (e.g., lowered aggressiveness),
indicating effective detoxification while largely maintaining content integrity.
3.2. Italian
We introduce the DetoxifyIT dataset, a new detoxification dataset for Italian, featuring 600 toxic
comments and their human-written neutral rewrites.</p>
      </sec>
      <sec id="sec-3-4">
        <title>3.2.1. Input Data Preparation</title>
        <p>We use three datasets for manual detoxification: two from Twitter and one from Wikipedia. The Twitter
datasets originate from EVALITA shared tasks—AMI (2020), focused on misogyny [10], and HODI (2023),
targeting homotransphobia [11], each comprising approximately 5,000 annotated tweets. Posts are
labeled as either hate or non-hate speech, with additional subcategories provided for hateful content.
The Wikipedia dataset is drawn from Jigsaw’s Multilingual Toxic Comment Classification Challenge 5,
and consists entirely of toxic comments.</p>
        <p>To identify content suitable for manual detoxification, we applied a multi-stage filtering process. All
datasets were constrained to a post length of 5–30 words to ensure contextual clarity. Since hate speech
and toxicity labels do not fully overlap — for instance, non-toxic content may still be hateful and vice
versa — we additionally used the Perspective API6 to obtain toxicity scores for the Twitter data. We
excluded tweets that were either insufficiently toxic or excessively severe, ultimately retaining 400
tweets, with 200 per target group. A detailed description of the filtering methodology is provided in
[12].</p>
      </sec>
      <sec id="sec-3-5">
        <title>3.2.2. Annotation Process</title>
        <p>The annotation process followed the guidelines established by the 2024 edition of this shared task [6],
with the primary objectives of removing toxicity while preserving the original meaning. Annotators
were instructed to rephrase toxic content wherever possible, using deletion only as a last resort. Three
native Italian speakers with expertise in NLP and toxic language worked on the rewrites. The process
was collaborative: one annotator rewrote the first set of 100 texts, and then all three reviewed them
together to ensure consistency with the guidelines. This group review was repeated after 300 and 600
texts. The final dataset includes only texts on which all three annotators agreed. A fourth expert later
reviewed the full set and suggested small improvements where needed.
3.3. Hebrew
We introduce the HeDetox dataset, a new detoxification dataset for Hebrew constructed from offensive
online forum comments and annotated through a multi-stage process. Our approach builds on prior
linguistic taxonomies and recent prompting techniques for detoxification.</p>
        <sec id="sec-3-5-1">
          <title>5https://kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification 6https://www.perspectiveapi.com/</title>
        </sec>
      </sec>
      <sec id="sec-3-6">
        <title>3.3.1. Input Data Preparation</title>
        <p>The Hebrew HeDetox dataset is derived from user-generated content on a popular Israeli news forum
(https://rotter.net/forum/listforum.php), where emotionally charged discussions on current events are
frequent. A custom web scraping pipeline was used to collect discussion threads, extract metadata
(e.g., timestamps, post IDs), and normalize the comment text. A strict anonymization process removed
personally identifiable information such as usernames, mentions, and embedded links.</p>
        <p>To identify toxic content, we employed a few-shot classification method using large language models
(LLMs), prompted with definitions and reasoning chains based on the Simplified Offensive Language
(SOL) Taxonomy [13]. This taxonomy provides a stepwise classification structure that includes offense
type, target, vulgarity level, and implicit linguistic devices (e.g., irony, metaphor). Each comment was
annotated by the LLM as explicitly offensive, implicitly offensive, or non-offensive. To ensure precision,
we retained only those labeled as explicitly offensive, discarding borderline or ambiguous examples.
Input Toxicity Data The toxic inputs for HeDetox were selected from the scraped corpus based on
LLM classifications and filtered for explicit ofensiveness to serve as input for the detoxification task.</p>
      </sec>
      <sec id="sec-3-7">
        <title>3.3.2. Annotation Process</title>
        <p>Annotation Tasks We adapted the few-shot Chain-of-Thought (CoT) prompting framework
introduced by Dementieva et al. [14] for Hebrew. A custom Hebrew-language prompt was designed to guide
the LLM in identifying elements of toxicity and producing neutralized rewrites. The prompt included
keyword-based reasoning, strict instructions to preserve meaning and tone, and negative examples that
illustrated undesirable behaviors such as unsolicited advice or paraphrasing. We applied the prompt in
a few-shot setting, providing two in-context examples before detoxifying each offensive sentence.
Manual Correction of LLM Outputs To assess the quality of LLM-generated detoxifications, we
implemented a two-phase manual correction process. In the first phase, 100 sentences were
independently reviewed and revised by two annotators, with a third adjudicator resolving discrepancies and
ensuring adherence to annotation guidelines. Annotators were instructed to avoid common issues such
as over-softening, omission of key content, introduction of new information, imprecise synonym use,
and retention of toxic language.</p>
        <p>Inter-annotator agreement was evaluated using cosine similarity over sentence embeddings. We
compared heBERT [15], multilingual BERT (mBERT) [16], and traditional vector models (n-grams, tf-idf).
Table 3 presents the results. Despite syntactic variability, both heBERT and mBERT demonstrated
strong semantic agreement between annotators, while traditional syntactic representations showed
lower similarity.</p>
        <p>In the second phase (500 sentences), we streamlined the process by assigning one annotator and one
corrector per sentence. The annotator generated detoxified rewrites based on the same guidelines as in
phase one, and the corrector reviewed them to ensure semantic fidelity and minimal stylistic alteration.
This setup reduced variation and improved consistency across the dataset.
3.4. Japanese
The Japanese split is constructed by a CS graduate student who is a native Japanese speaker in the
following procedure:</p>
      </sec>
      <sec id="sec-3-8">
        <title>3.4.1. Input Data Preparation</title>
        <p>We base our data construction on the open2ch corpus [17],7 a large collection of user-generated texts
from a popular thread-based social platform in Japan covering various topics. We apply keyword-based
filtering to obtain sentences that are likely to be toxic as our starting point for the annotation. 8 This
filtering is applied to 10,000 sentences from the original dataset, and approximately 60% of the sentences
are detected as potentially toxic and compose the dataset for annotation described in the subsequent
section.</p>
      </sec>
      <sec id="sec-3-9">
        <title>3.4.2. Annotation Process</title>
        <p>Given the dataset of potentially toxic sentences, a native Japanese speaker from a CS PhD program
carried out the annotation. The semantics from the original texts are preserved as much as possible, and
the samples are omitted when (I) The whole text is toxic, making it unable to detoxify, (II) The original
text is not toxic at all. At the end of the annotation, 3488 sentences are considered, and 600 sentences
have been detoxified; other sentences could not be annotated due to the two previously mentioned
reasons. Most of the unannotated samples are invalid because of the reason (I). This is due to the nature
of the data source: a long-running, fully anonymized thread-based online platform.
3.5. Hinglish</p>
      </sec>
      <sec id="sec-3-10">
        <title>3.5.1. Input Data Preparation</title>
        <p>Input Toxicity Data We used the aggression annotated corpus of Hindi-English code-mixed data
proposed in [18] for framing 600 samples of toxic-detoxified pairs. Contents in the dataset are obtained
from Facebook and are made up of a combination of Hindi-English code-mixed posts that are relevant
within Indian subcontinent. Publicly available dataset9 consists of two splits– train and dev. We sampled
our data from train; dev split and remaining samples from test were used to train toxicity classifier.
Publicly available train split contains a total of 12,000 posts bifurcated into three categories OAG (overtly
aggressive), CAG (covertly aggressive) and NAG (non-aggressive) each containing 4856, 4869 and 2275
samples respectively. Since our work is centered on text detoxification, we carefully sample 600 posts
from OAG and CAG categories that have toxic contents and are detoxifiable.</p>
        <p>Input Preprocessing From an initial collection of 9,725 posts across the OAG and CAG categories,
we first filtered out posts written in Hindi, leaving us exclusively with samples in Hinglish. We then
performed deduplication through exact string matching to eliminate duplicate entries. To ensure
data cleanliness and consistency, mentions, links, and emojis were systematically removed. Posts
containing fewer than five tokens, separated by whitespace, were excluded from the dataset, while
those exceeding twenty-five tokens were reformulated to adhere to this length constraint. Importantly,
all these modifications were made in a way that preserved the original intent and toxicity levels of each
post.</p>
        <sec id="sec-3-10-1">
          <title>7https://hf.co/datasets/p1atdev/open2ch 8https://github.com/MosasoM/inappropriate-words-ja 9https://github.com/victor7246/Hinglish_Hate_Detection/blob/main/data/raw/trac1-dataset/hindi/agr_hi_train.csv</title>
        </sec>
      </sec>
      <sec id="sec-3-11">
        <title>3.5.2. Annotation Process</title>
        <p>Annotation Task(s) After the initial input preprocessing, posts were meticulously reviewed through
manual verification and were systematically categorized into two groups: detoxifiable and
nondetoxifiable. This classification, along with the rephrasing process discussed in previous section, was
conducted by an NLP researcher with practical expertise in hate speech and toxic language mitigation.</p>
        <p>From the preprocessed dataset of 4,824 posts, a total of 600 detoxifiable posts were collected. Collection
was halted once this target was reached. From these, a representative subset of 20 posts was carefully
selected as benchmark detoxification samples. These samples underwent expert review by two native
Hindi speakers to ensure high-quality reference standards. Using these expert-validated examples
as guidance, annotators were instructed to rephrase toxic content into non-toxic expressions while
preserving the original meaning of each post. The detoxification process was carried out independently
by two trained annotators, whose details are provided in a dedicated subsection. Each sample was
detoxified by one annotator.</p>
        <p>Annotators A male NLP researcher, with expertise in the detection and mitigation of hate speech
and toxic language, was engaged alongside a third-year undergraduate male student with practical
experience in machine learning. Both individuals are native Hindi speakers from India and possess an
in-depth understanding of the thematic content encompassed within the dataset. Furthermore, their
strong proficiency in reading and writing Hinglish ensures precise and nuanced annotation. Together,
they are responsible for executing a comprehensive detoxification of the entire dataset, leveraging their
specialized skills and linguistic expertise.
3.6. Tatar
We introduce the TatDetox dataset, a high-quality, fully manually annotated and validated detoxification
dataset for Tatar constructed from social media posts.</p>
      </sec>
      <sec id="sec-3-12">
        <title>3.6.1. Input Data Preparation</title>
        <p>The data was sourced from the Web Corpus, a collection that gathers texts from various resources
focusing on the minority languages of Russia.10 For the Tatar language, the Web Corpus provides posts
on the social network VKontakte. Two methods were applied for data filtering and selecting examples
for detoxification. The initial filtering was based on a toxic lexicon [ 6]. We used both Tatar and Russian
lexicons, as the texts feature code-switching and many obscene words used in Tatar are borrowed from
Russian. Additionally, a sentiment classifier was used [ 19]. The classifier identifies six emotions: Anger,
Joy, Sadness, Fear, Disgust, and Surprise. It was observed that the largest number of toxic texts appeared
in the Anger class. Thus, statements labeled with this category were selected for annotation. Texts
obtained through different filtering methods were combined, cleaned of HTML tags, anonymized, and
then sent for further annotation.</p>
      </sec>
      <sec id="sec-3-13">
        <title>3.6.2. Annotation Process</title>
        <p>Two annotators participated in the annotation process, both native speakers, one of whom is an expert
in the field of NLP. The annotators were tasked with checking the text for toxicity and writing a
detoxified version for toxic examples following the general guidelines provided by task organizers. The
annotators were also instructed to preserve the original spelling: if the text was written exclusively
in Russian letters, the rewritten version had to use only Russian letters as well; if Tatar letters were
used, the detoxified text needed to maintain this writing system. Cross-validation of the examples was
then carried out, with each annotator validating the examples provided by the other. A total of 1004
examples were selected for annotation, of which 600 were included in the final dataset.
10http://web-corpora.net/wsgi3/minorlangs/download</p>
      </sec>
    </sec>
    <sec id="sec-4">
      <title>4. Baselines</title>
      <p>We provide five baselines for our shared task: (i) a trivial Duplicate baseline, (ii) a rule-based Delete
approach, (iii) a Backtranslation pipeline that reduces the task to a monolingual setting, (iv) a fine-tuned
mT0 model covering 9 out of the 15 languages, and (v) zero-shot LLMs with instruction prompts. The
code for all baselines is publicly available11.</p>
      <p>Duplicate A trivial baseline where the output is simply a copy of the input. No detoxification is
applied, and the original toxic content is returned unchanged.</p>
      <p>Delete This unsupervised baseline removes toxic or obscene substrings from the input text based
on predefined keyword lists. For the shared task, we compiled such lists for all 15 target languages
using publicly available resources (see Table 4). The number of keywords varies by language, reflecting
morphological diversity and differing ways of expressing toxicity. We release the full multilingual
keyword collection for participants and public use12.
Backtranslation This is a more sophisticated unsupervised baseline based on cross-lingual
transfer. The approach works by first translating non-English inputs into English using the NLLB-3.3B
model [20].13 Detoxification is then performed using the English-language model bart-base-detox,
fine-tuned on the ParaDetox training set [4].14 Finally, the detoxified text is translated back into the
original target language using NLLB. For Hinglish, we use the specialized RLM-hinglish-translator
model.15
Fine-tuned mT0 We consider the mT0-XL-Detox-ORPO model [32],16 one of the top-performing
systems from the TextDetox 2024 shared task. Given its strong performance, we adopt it as a baseline
11https://github.com/pan-webis-de/pan-code/tree/master/clef25/text-detoxification/baselines
12https://hf.co/datasets/textdetox/multilingual_toxic_lexicon
13https://hf.co/facebook/nllb-200-3.3B
14https://hf.co/s-nlp/bart-base-detox
15https://hf.co/rudrashah/RLM-hinglish-translator
16https://hf.co/s-nlp/mt0-xl-detox-orpo
in this year’s competition. Although it was fine-tuned on only 9 of the 15 languages considered in this
competition, we observed promising zero-shot performance on the 6 remaining languages.
LLMs Prompting This baseline uses the Llama-3.1-70B-Instruct model,17 with few-shot examples
embedded in the instruction prompt. The model performs detoxification based on the given examples
without any task-specific fine-tuning. A second prompting variant relies on general-purpose instruction prompts
but uses various proprietary OpenAI models, including GPT-4 (0613), GPT-4o (2024-08-06), and GPT-3.5
(o3-mini-2025-01-31). We provide a basic detoxification prompt and evaluate the zero-shot capabilities
of these models without additional tuning (we utilized the same prompt18 for all models).</p>
    </sec>
    <sec id="sec-5">
      <title>5. Automatic Evaluation Setup</title>
      <p>We adopt the evaluation approach from CLEF 2024 competition [6] and apply adjustments to the
underlying models. As in that work, we also measure the final detoxification metric as a combination of
three sub-metrics: toxicity, content similarity and fluency of the generated text compared to source and
reference texts. The evaluation script is available online.19
Toxicity Measurement (TOX) assesses how toxic the evaluated text is. Since the goal is to detoxify
the text, lower scores indicate better performance. We evaluate toxicity using an XLM-R [33] model
fine-tuned on a multilingual toxicity corpus covering 15 languages. We released both the model 20
and the underlying dataset21. The model was fine-tuned using supervised learning (SFT) for a binary
classification task on around 5,000 samples per language sampled from many corpora used for text
detoxification data selection. For calculating final toxicity score, we specifically consider the model’s
predicted probability that the generated text belongs to the «toxic» class. In contrast to the evaluation
made in the CLEF-2024 competition, we not only updated the model but also adjusted the approach for
calculating the final toxicity score. This was done by comparing the probability that the generated text,
source input text, and reference output text belong to the «toxic» class, using the following rules: (i) if
the probability of the generated text is higher than for the source input, we penalize the score and set it
to zero; (ii) if the probability of the generated text is lower than for the reference text, we reward the
score and set it to one.</p>
      <p>Content similarity (SIM) evaluates how well the generated texts preserve key semantic information
from the original input. This metric penalizes outputs that miss essential content during the
detoxification process. Following CLEF-2024, we compute content similarity using cosine similarity between
LaBSE22 embeddings [34]. However, unlike the previous setup, which only measures similarity between
the source input and the generated text—ignoring reference outputs—our approach addresses this
limitation. We propose an improved metric that combines both input-output and output-reference similarities,
using a weighted sum. This enhancement captures not only fidelity to the original input but also
alignment with human-annotated reference texts, providing a more comprehensive evaluation of content
preservation: SIM(x, y, r) = cos(x, y) · w_{x,y} + cos(y, r) · w_{y,r}, where w_{x,y} + w_{y,r} = 1, for source input x, generated output y, and reference r.
Fluency Estimation (FL) measures how natural, coherent, and grammatically correct a generated
text is—essentially, how closely it resembles language produced by a native speaker. In the
context of generated detoxification, this reflects whether the output reads smoothly and idiomatically
without spelling mistakes or unnatural constructions. In CLEF-2024, fluency was measured using
17https://hf.co/mlabonne/Llama-3.1-70B-Instruct-lorablated
18https://github.com/pan-webis-de/pan-code/tree/master/clef25/text-detoxification/baselines/openai
19https://github.com/pan-webis-de/pan-code/tree/master/clef25/text-detoxification
20https://hf.co/textdetox/xlmr-large-toxicity-classifier-v2
21https://hf.co/datasets/textdetox/multilingual_toxicity_dataset
22https://hf.co/sentence-transformers/LaBSE
8000
Fluency</p>
      <p>Content similarity</p>
      <p>Toxic pairwise score
0
1
0</p>
      <p>1
Value</p>
      <p>Value
ChrF [35] scores between generated outputs and human-annotated references. However, this approach
ignores the original toxic input, leading to a bias toward reference-style outputs and neglecting the
relevance of the transformation from source to target. To address this, we adopt XCOMET [36], a
metric originally designed for machine translation evaluation. Unlike ChrF, XCOMET considers the
input–generation–reference triplet, modeling fluency in the context of both the source and reference. It
leverages pretrained language models to assess fluency beyond surface-level matching, incorporating
deeper semantic and syntactic patterns. We specifically use myyycroft/XCOMET-lite [37], a compressed
version of Unbabel/XCOMET-XXL [36] that retains over 95% of its performance while reducing
computational cost by 60%. This efficiency makes it suitable for real-time evaluation in our competition
platform and for participant use.</p>
      <sec id="sec-5-1">
        <title>Joint score (J) is the aggregation of the three aforementioned metrics.</title>
        <p>
          J = (1/n) · ∑_{i=1}^{n} TOX(x_i, y_i, r_i) · SIM(x_i, y_i, r_i) · FL(x_i, y_i, r_i),
where TOX(x, y, r), SIM(x, y, r), FL(x, y, r) ∈ [
          <xref ref-type="bibr" rid="ref1">0, 1</xref>
          ] for each text detoxification output y, source
toxic text x and reference annotated detoxification text r.
        </p>
      </sec>
    </sec>
    <sec id="sec-6">
      <title>6. LLM as a Judge</title>
      <p>As an additional evaluation strategy, we employed LLMs as automatic judges to assess the quality of
system submissions. We explored two main paradigms: the use of out-of-the-box, pre-trained LLMs, and
customized LLMs fine-tuned specifically for the evaluation tasks. The out-of-the-box models utilized in
our experiments included GPT-4.1 mini, GPT-4.1 nano, CompassJudger-1-32B-Instruct [38],
DeepSeekR1-Distill-Qwen-32B [39], DeepSeek-V3-0324 [39], and Llama-3.3-70B-Instruct.23 The prompts used for
all models for three tasks are shown in Appendix B.</p>
      <p>For model customization and further alignment with the requirements of the shared task, we
conducted additional fine-tuning experiments using Llama-3.1-8B 24 and Qwen-3-8B [40]25 models.
Fine-tuning was performed using the Low-Rank Adaptation (LoRA) [41] method to efficiently adapt the
base models while minimizing computational overhead. The main model weights were loaded in
4-bit quantized format, enabling faster training and reduced memory usage without significant loss in
performance. The LoRA configuration utilized the following hyperparameters: rank r = 8, scaling factor α = 16, and
a dropout rate of 0.1, with adaptation applied to all linear layers. Optimization was carried out using
23https://hf.co/meta-llama/Llama-3.3-70B-Instruct
24https://hf.co/meta-llama/Llama-3.1-8B
25https://hf.co/Qwen/Qwen3-8B
a learning rate of 1 × 10⁻⁴, cosine learning rate scheduler, weight decay of 1 × 10⁻⁴, warmup ratio
set to 0.0, and maximum gradient norm of 1.0. The fine-tuning process was conducted for 2 epochs
over the training data. For each task (i.e., content similarity, style transfer, or fluency) we fine-tuned
a standalone LoRA adapter.</p>
      <p>The evaluation models were trained (where applicable) and tested using datasets from the CLEF
TextDetox 2024 [6] and RUSSE 2022 [42, 43] competitions, both of which provide pairs of toxic and
detoxified paraphrases annotated with human quality scores. For our experiments, we utilized a subset
of the CLEF TextDetox 2024 dataset as the test set, while the remaining data from both CLEF 2024 and
RUSSE 2022 were used for training and development. This resulted in a total of 12,279 training pairs
and 4,320 test pairs.</p>
      <p>Figure 2 presents the distribution of the labels available for fine-tuning within the selected dataset. It
is evident that the data is skewed towards positive cases. While this imbalance is somewhat justifiable
for the toxic pairwise score and content similarity score, in the case of the fluency score, the skew is
particularly severe: the dataset contains 984 negative and 11,295 positive instances. We conducted
an initial round of fine-tuning experiments for both of the selected models using the fluency score.
However, the extreme class imbalance resulted in models that consistently predicted only the positive
class, rendering the approach ineffective. Consequently, we decided not to pursue further fine-tuning
experiments for fluency and do not report results for this setting. Therefore, our fine-tuning efforts
focused exclusively on content similarity and toxic pairwise scoring, while fluency is reported only for
the out-of-the-box LLMs.</p>
      <p>The results of our experiments are presented in Figure 3. It is evident that, for most languages, the
fine-tuned Llama-3.1-8B outperforms the alternative LLMs on the tasks of content similarity and toxic
pairwise scoring. Based on these results, we selected the fine-tuned Llama 3.1-8B as the primary model
for these tasks.</p>
      <p>With respect to fluency, all models evaluated in their out-of-the-box configurations demonstrated
consistently low correlation with human judgments (below 0.35). As a result, we do not consider any
of the tested models suitable for the fluency assessment task in our shared task setting.</p>
    </sec>
    <sec id="sec-7">
      <title>7. Participants</title>
      <p>We received 25 submissions for the development phase leaderboard and 26 submissions for the test
phase leaderboard. Here, we briefly describe some solutions of our final participants:
Sky.Duan [44] Employs a parallel architecture that integrates both local models and large language
models for multilingual text detoxification. The system combines s-nlp/mt0-xl-detox-orpo and Qwen3
to leverage the strengths of specialized and general-purpose models, enabling robust and intelligent
detoxification across different languages.
d1n910 [45] Utilized a Chain-of-Thought (CoT) prompting approach with the Deepseek-r1 large
language model to enhance the reasoning capabilities and effectiveness of text detoxification.
Pratham [46] Developed a multilingual text detoxification system centered on MT0-XL with
task-specific prompting, complemented by language-specific lexical filtering using custom toxic word
dictionaries. This hybrid approach ensured robust detoxification across 15 languages, particularly
handling challenges in code-mixed and morphologically rich languages through handcrafted filtering
rules.
ducanhhbtt [47] Built an efficient multilingual detoxification system leveraging the Gemma 3 12B
model with LoRA-based fine-tuning and advanced prompting, including few-shot retrieval and
chain-of-thought reasoning. The approach combined progressive fine-tuning phases and extensive data
augmentation to ensure high performance across both high- and low-resource languages, all while
maintaining computational efficiency.
nikita.sushko [48] Applied supervised fine-tuning over the MultiParaDetox, SynthDetoxM, and a
custom synthetic dataset generated using the SynthDetoxM pipeline to enhance detoxification
performance.
humairafaridq [49] Proposed a prompt-driven, truly multilingual approach using only the
GPT-4omini model and in-context learning, where each toxic input is paired with a fixed instruction and three
language-specific toxic-to-neutral examples, eliminating the need for model fine-tuning.
Jiaozipi [50] Introduced a multilingual detoxification method based on an ensemble of large language
models (DeepSeek, Qwen, Kimi) guided by the RISE framework and hint engineering, using few-shot
examples and multi-dimensional evaluation to select optimal outputs without fine-tuning.</p>
      <sec id="sec-7-1">
        <title>Oleg_Papulov</title>
        <p>detoxification.</p>
        <p>Utilized Qwen3-0.6B with LoRA fine-tuning on neutral-toxic text pairs to achieve
SVATS [51] Explored multiple model architectures (Qwen2-7B, Gemma-2 4B), comparing full and
LoRA fine-tuning, dataset variations, and multilingual vs. English-only strategies, while also evaluating
few-shot prompting with GPT-4o and a baseline deletion method for robust multilingual detoxification.
MetaDetox [52] Eliminated the need for fine-tuning by applying Chain-of-Thought prompting and
few-shot learning with DeepSeek, generating stylistically diverse rewrites for each input, and selecting
the best output via semantic similarity and toxicity-based reranking across 15 languages.
The Toxinators 2000 (jellyproll) Used the baseline MT0 model for most languages, while applying
a vocabulary replacement method for Hinglish and a combination of vocab replacement and MT0 for
Tatar and Japanese.</p>
        <p>Team Detox (Gopal) [53] Developed a hybrid system combining rule-based toxic span masking
with few-shot prompting of GPT-4o-mini, where toxic words are masked and both masked and original
sentences are provided to the model to generate detoxified outputs.</p>
        <p>Nililusu (ylmmcl) [54] Created a multilingual pipeline integrating lexicon- and classifier-based
toxicity detection, translation for non-English inputs, and ensemble detoxification using three generative
models, with outputs selected by evaluation metrics and back-translated to the original language.
wl2776</p>
        <p>Trained T5 model using the prompt "Detoxify: &lt;sentence&gt;" to perform text detoxification.
SomethingAwful Utilized Llama 3.1 with explicit reasoning for generation, combined with a
best-of-five selection strategy using example-based Self-BLEU Scoring (SBS) to choose the optimal detoxified
output.</p>
      </sec>
    </sec>
    <sec id="sec-8">
      <title>8. Results</title>
      <p>Here, we provide the final results of the final test phase of our shared task for both automatic leaderboard
from Codalab (Section 8.1) and LLM-as-a-Judge (Section 8.2). The full detailed tables of results per each
language and per each metric set can be found in Appendix A.
8.1. Automatic Evaluation Leaderboard
The results of the automatic evaluation used in Codalab for languages with parallel training data (AvgP)
are presented in Table 5, for new languages without parallel training data—in Table 6.</p>
      <p>Several diverse teams succeeded in surpassing multiple baseline models presented in our shared task.
However, unlike in the previous edition, no team was able to outperform the human reference outputs
this year. Among the submissions, Team ducanhhbtt with Gemma model underneath achieved the
highest overall scores for languages with available training data, closely followed by Team MetaDetox.</p>
      <p>For the newly introduced languages, the performance landscape shifted notably—often quite
dramatically—compared to the original set. Starting with the baselines, the mT0 model, which was fine-tuned
only on the original nine languages, experienced a substantial drop in performance and was clearly
outperformed by GPT-4. This outcome is expected: mT0 lacks exposure to the new languages, whereas
GPT-4, as a more general-purpose large language model, has likely seen a broader range of languages
during pretraining. The only exception was Italian, where mT0 performed comparably well—likely due
to the presence of Spanish (a closely related language) in the training data.</p>
      <p>Team ducanhhbtt once again led the leaderboard, now achieving the highest average score (AvgNP)
across five of the six new languages, with only Tatar posing a challenge. For Tatar, jellyproll
outperformed other teams by incorporating targeted vocabulary substitution specifically tailored to this
underrepresented language. Notably, Team adugeen showed a significant performance boost on the
new languages, rising to second place overall.</p>
      <p>An important takeaway is that many more teams were able to surpass the best-performing GPT-4
baseline this year. This demonstrates that general-purpose LLMs, while powerful, are not sufficient
out-of-the-box for specialized tasks like multilingual text detoxification—particularly in low-resource
settings. The top-performing teams effectively integrated insights from our shared task into their
pipelines, leveraging techniques such as cross-lingual transfer, vocabulary adaptation, and advanced
prompting. These results underscore the need for task- and language-specific adaptation to ensure
robust and culturally sensitive content moderation across diverse linguistic contexts.
baseline_backtranslation</p>
      <p>AvgNP*
8.2. LLM as a Judge Leaderboard
The results of the LLM-as-a-Judge evaluation for languages with parallel training data (AvgP) are
presented in Table 7, for new languages without parallel training data—in Table 8.</p>
      <p>With the updated evaluation setup, the overall ranking of systems shifted slightly, and new leading
teams emerged for several languages. Interestingly, as detailed in the full evaluation reports in the
Appendix A, LLM-as-a-Judge results show that, for languages such as English, Spanish, Russian,
Ukrainian, and Hebrew, some newly submitted systems significantly outperformed even the human
references. This trend reflects both the relative resource richness of certain languages—where modern
models likely benefit from more extensive exposure to toxicity-related data—and the emergence of
underrepresented languages where LLMs, due to their broader pretraining, now demonstrate greater
fluency and generalization capabilities compared to older decoder-based models.</p>
      <p>In the final AvgP results, team MetaDetox secured first place, demonstrating top performance across
all nine languages with training data. They are closely followed by team ducanhhbtt, while team
adugeen now ranks third, maintaining the strongest performance for Ukrainian. Additionally, team
jellyproll achieved the highest score for Amharic, and team nikita.sushko led in Russian.</p>
      <p>For the new languages without training data, team adugeen now holds the highest average score,
with ducanhhbtt following closely behind. Impressively, ducanhhbtt achieved near top results
across all six new languages, aligning with earlier findings from the automatic evaluation. A notable
development is the strong performance by team humairafaridq, which unexpectedly placed third
overall for new languages. Significant shifts were observed particularly in the rankings for Hebrew
and Hinglish, highlighting the dynamic nature of model generalization in low-resource and culturally
specific settings.</p>
      <p>We congratulate all participating teams for their creative and impactful solutions—several of which
surpassed even proprietary systems like GPT-4. While this year’s evaluation setup showed stronger
alignment with human judgments, the results also underscore the continued need for developing more
robust, culturally aware evaluation metrics for multilingual text style transfer.</p>
    </sec>
    <sec id="sec-9">
      <title>9. Conclusion</title>
      <p>In Multilingual Text Detoxification shared task at PAN 2025, participants were tasked with transforming
text style from toxic to non-toxic across 15 languages: English, Spanish, German, Chinese, Arabic,
Hindi, Ukrainian, Russian, Amharic, Italian, French, Hebrew, Hinglish, Japanese, and Tatar. The task
was divided into two challenges: multilingual one with the parallel training data available as well
as cross-lingual one stressing models with new unseen languages. Participants’ submissions in both
phases underwent evaluation using an improved set of automatic metrics, followed by additional novel
LLM-as-a-Judge evaluation.</p>
      <p>We received a wide range of submissions leveraging both LLMs—such as DeepSeek, Gemma, Qwen,
and GPT-4—as well as fine-tuned decoder models like mT0, often enhanced with specialized
preprocessing or combined methodologies. Many participating teams succeeded in outperforming even our
strongest baselines, including mT0 fine-tuned on our data and GPT-4. These results highlight the
importance of adapting standard LLM prompting or fine-tuning approaches to the unique demands
of the text detoxification task, especially when dealing with underrepresented languages. Notably,
several systems even surpassed human reference outputs in languages such as English, Spanish, Russian,
Hebrew, and Ukrainian.</p>
      <p>This year, we also introduced a more comprehensive evaluation framework that combined improved
automatic metrics with an additional LLM-as-a-Judge setup. By comparing both leaderboards, we
observed consistent overall trends with slight shifts in ranking, though certain teams and languages
exhibited significant reordering. These variations reflect the inherent complexity and subjectivity of
tasks like text detoxification and proactive moderation of abusive speech—challenges that are deeply
influenced by linguistic and cultural context. The results emphasize the continued need for high-quality
human-annotated data and more sophisticated, culturally sensitive automatic evaluation metrics to
ensure fair and reliable evaluation process.</p>
    </sec>
    <sec id="sec-10">
      <title>Acknowledgment</title>
      <p>We express our deepest gratitude to Toloka.ai platform for our shared task support. Daryna Dementieva’s
work was additionally supported by Alexander Fraser’s TUM Heilbronn chair as well as Friedrich
Schiedel TUM Think Tank Fellowship. Naquee Rizwan, Shehryaar Shah Khan, and Animesh Mukherjee
would like to thank SPARC-II (Scheme for Promotion of Academic and Research Collaboration, Phase
II) project for funding international travel and subsistence to carry out this work. Ilseyar Alimova
would like to thank AIRI for funding the preparation of the Tatar dataset and Dina Abdullina for help
with dataset annotation. Arianna Muti’s and Debora Nozza’s research is supported by the European
Research Council (ERC) under the European Union’s Horizon 2020 research and innovation program
(grant agreement No. 101116095, PERSONAE). Arianna Muti and Debora Nozza are members of the
MilaNLP group and the Data and Marketing Insights Unit of the Bocconi Institute for Data Science and
Analysis.</p>
    </sec>
    <sec id="sec-11">
      <title>Declaration on Generative AI</title>
      <p>During this study, an AI assistant was utilized in the writing process. ChatGPT was employed for
paraphrasing throughout the paper’s formulation, followed by the authors’ thorough additional verification.
Policies, platforms, and research integration. Advancing NLP for hate speech proactive mitigation,
in: Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics
(ACL), Association for Computational Linguistics, Vienna, Austria, 2025. URL: https://www.inf.
uni-hamburg.de/en/inst/ab/lt/publications/2025-rizwanetal-acl-hateprism.pdf.
[3] J. Bevendorf, D. Dementieva, M. Fröbe, B. Gipp, A. Greiner-Petter, J. Karlgren, M. Mayerl, P. Nakov,
A. Panchenko, M. Potthast, A. Shelmanov, E. Stamatatos, B. Stein, Y. Wang, M. Wiegmann,
E. Zangerle, Overview of PAN 2025: Voight-Kampf Generative AI Detection, Multilingual Text
Detoxification, Multi-Author Writing Style Analysis, and Generative Plagiarism Detection, in:
J. C. de Albornoz, J. Gonzalo, L. Plaza, A. G. S. de Herrera, J. Mothe, F. Piroi, P. Rosso, D. Spina,
G. Faggioli, N. Ferro (Eds.), Experimental IR Meets Multilinguality, Multimodality, and Interaction.
Proceedings of the Sixteenth International Conference of the CLEF Association (CLEF 2025),
Lecture Notes in Computer Science, Springer, Berlin Heidelberg New York, 2025.
[4] V. Logacheva, D. Dementieva, S. Ustyantsev, D. Moskovskiy, D. Dale, I. Krotova, N. Semenov,
A. Panchenko, ParaDetox: Detoxification with parallel data, in: Proceedings of the 60th Annual
Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Association
for Computational Linguistics, Dublin, Ireland, 2022, pp. 6804–6818. URL: https://aclanthology.
org/2022.acl-long.469. doi:10.18653/v1/2022.acl-long.469.
[5] D. Dementieva, V. Logacheva, I. Nikishina, A. Fenogenova, D. Dale, I. Krotova, N. Semenov, T.
Shavrina, A. Panchenko, RUSSE-2022: Findings of the First Russian Detoxification Shared Task Based
on Parallel Corpora, COMPUTATIONAL LINGUISTICS AND INTELLECTUAL TECHNOLOGIES
(2022). URL: https://api.semanticscholar.org/CorpusID:253169495.
[6] D. Dementieva, D. Moskovskiy, N. Babakov, A. A. Ayele, N. Rizwan, F. Schneider, X. Wang, S. M.</p>
      <p>Yimam, D. Ustalov, E. Stakovskii, A. Smirnova, A. Elnagar, A. Mukherjee, A. Panchenko, Overview
of the multilingual text detoxification task at PAN 2024, in: G. Faggioli, N. Ferro, P. Galuscáková,
A. G. S. de Herrera (Eds.), Working Notes of the Conference and Labs of the Evaluation Forum
(CLEF 2024), Grenoble, France, 9-12 September, 2024, volume 3740 of CEUR Workshop Proceedings,
CEUR-WS.org, 2024, pp. 2432–2461. URL: https://ceur-ws.org/Vol-3740/paper-223.pdf.
[7] A. Pavao, I. Guyon, A. Letournel, D. Tran, X. Baró, H. J. Escalante, S. Escalera, T. Thomas, Z. Xu,
Codalab competitions: An open source platform to organize scientific challenges, J. Mach. Learn.</p>
      <p>Res. 24 (2023) 198:1–198:6. URL: https://jmlr.org/papers/v24/21-1436.html.
[8] C. Brun, V. Nikoulina, FrenchToxicityPrompts: a large benchmark for evaluating and mitigating
toxicity in French texts, in: R. Kumar, A. K. Ojha, S. Malmasi, B. R. Chakravarthi, B. Lahiri, S. Singh,
S. Ratan (Eds.), Proceedings of the Fourth Workshop on Threat, Aggression &amp; Cyberbullying @
LREC-COLING-2024, ELRA and ICCL, Torino, Italia, 2024, pp. 105–114. URL: https://aclanthology.
org/2024.trac-1.12/.
[9] I. Kivlichan, J. Sorensen, J. Elliott, L. Vasserman, M. Görner, P. Culliton, Jigsaw multilingual toxic
comment classification.,
https://kaggle.com/competitions/jigsaw-multilingual-toxic-commentclassification, 2020.
[10] E. Fersini, D. Nozza, P. Rosso, AMI @ EVALITA2020: automatic misogyny identification, in:
V. Basile, D. Croce, M. D. Maro, L. C. Passaro (Eds.), Proceedings of the Seventh Evaluation
Campaign of Natural Language Processing and Speech Tools for Italian. Final Workshop (EVALITA
2020), Online event, December 17th, 2020, volume 2765 of CEUR Workshop Proceedings,
CEURWS.org, 2020. URL: https://ceur-ws.org/Vol-2765/paper161.pdf.
[11] D. Nozza, A. T. Cignarella, G. Damo, T. Caselli, V. Patti, HODI at EVALITA 2023: Overview of the
first shared task on homotransphobia detection in italian, in: M. Lai, S. Menini, M. Polignano,
V. Russo, R. Sprugnoli, G. Venturi (Eds.), Proceedings of the Eighth Evaluation Campaign of Natural
Language Processing and Speech Tools for Italian. Final Workshop (EVALITA 2023), Parma, Italy,
September 7th-8th, 2023, volume 3473 of CEUR Workshop Proceedings, CEUR-WS.org, 2023. URL:
https://ceur-ws.org/Vol-3473/paper26.pdf.
[12] V. De Ruvo, A. Muti, D. Dementieva, D. Nozza, Detoxify-it: An italian parallel dataset for text
detoxification, in: The 9th Workshop on Online Abuse and Harms (WOAH), Association for
Computational Linguistics, 2025.
[13] B. Lewandowska-Tomaszczyk, A. Baczkowska, O. Dontcheva-Navrátilová, C. Liebeskind, G. V.</p>
      <p>Oleškevičienė, S. Žitnik, M. Trojszczak, R. Povolná, L. Selmistraitis, A. Utka, D. Gudelis, Llod
schema for simplified ofensive language taxonomy in multilingual detection and applications,
Lodz Papers in Pragmatics 19 (2023) 301–324. URL: https://doi.org/10.1515/lpp-2023-0016. doi:
10.1515/lpp-2023-0016.
[14] D. Dementieva, N. Babakov, A. Ronen, A. A. Ayele, N. Rizwan, F. Schneider, X. Wang, S. M. Yimam,
D. Moskovskiy, E. Stakovskii, E. Kaufman, A. Elnagar, A. Mukherjee, A. Panchenko, Multilingual
and explainable text detoxification with parallel corpora, in: O. Rambow, L. Wanner, M. Apidianaki,
H. Al-Khalifa, B. D. Eugenio, S. Schockaert (Eds.), Proceedings of the 31st International Conference
on Computational Linguistics, Association for Computational Linguistics, Abu Dhabi, UAE, 2025,
pp. 7998–8025. URL: https://aclanthology.org/2025.coling-main.535/.
[15] A. Chriqui, I. Yahav, Hebert &amp; hebemo: a hebrew bert model and a tool for polarity analysis and
emotion recognition, INFORMS Journal on Data Science (2022).
[16] J. Devlin, M.-W. Chang, K. Lee, K. Toutanova, Bert: Pre-training of deep bidirectional transformers
for language understanding, in: Proceedings of the 2019 Conference of the North American
Chapter of the Association for Computational Linguistics: Human Language Technologies, 2019,
pp. 4171–4186.
[17] M. Inaba, おーぷん2ちゃんねる対話コーパスを用いた用例ベース対話システム, in: 第87回
言語・音声理解と対話処理研究会(第10回対話システムシンポジウム), 人工知能学会研究
会資料SIG-SLUD-B902-33, 2019, pp. 129–132.
[18] R. Kumar, A. N. Reganti, A. Bhatia, T. Maheshwari, Aggression-annotated corpus of Hindi-English
code-mixed data, in: Proceedings of the Eleventh International Conference on Language Resources
and Evaluation (LREC 2018), European Language Resources Association (ELRA), Miyazaki, Japan,
2018. URL: https://aclanthology.org/L18-1226.
[19] S. H. Muhammad, N. Ousidhoum, I. Abdulmumin, J. P. Wahle, T. Ruas, M. Beloucif, C. de Kock,
N. Surange, D. Teodorescu, I. S. Ahmad, D. I. Adelani, A. F. Aji, F. D. M. A. Ali, I. Alimova,
V. Araujo, N. Babakov, N. Baes, A. Bucur, A. Bukula, G. Cao, R. T. Cardenas, R. Chevi, C. I.
Chukwuneke, A. Ciobotaru, D. Dementieva, M. S. Gadanya, R. Geislinger, B. Gipp, O. Hourrane,
O. Ignat, F. I. Lawan, R. Mabuya, R. Mahendra, V. Marivate, A. Piper, A. Panchenko, C. H. P. Ferreira,
V. Protasov, S. Rutunda, M. Shrivastava, A. C. Udrea, L. D. A. Wanzare, S. Wu, F. V. Wunderlich, H. M.
Zhafran, T. Zhang, Y. Zhou, S. M. Mohammad, BRIGHTER: bridging the gap in human-annotated
textual emotion recognition datasets for 28 languages, CoRR abs/2502.11926 (2025). URL: https:
//doi.org/10.48550/arXiv.2502.11926. doi:10.48550/ARXIV.2502.11926. arXiv:2502.11926.
[20] M. R. Costa-jussà, J. Cross, O. Çelebi, M. Elbayad, K. Heafield, K. Heffernan, E. Kalbassi, J. Lam,
D. Licht, J. Maillard, A. Y. Sun, S. Wang, G. Wenzek, A. Youngblood, B. Akula, L. Barrault, G. M.
Gonzalez, P. Hansanti, J. Hofman, S. Jarrett, K. R. Sadagopan, D. Rowe, S. Spruit, C. Tran, P. Andrews,
N. F. Ayan, S. Bhosale, S. Edunov, A. Fan, C. Gao, V. Goswami, F. Guzmán, P. Koehn, A. Mourachko,
C. Ropers, S. Saleem, H. Schwenk, J. Wang, No language left behind: Scaling human-centered
machine translation, CoRR abs/2207.04672 (2022). URL: https://doi.org/10.48550/arXiv.2207.04672.
doi:10.48550/ARXIV.2207.04672. arXiv:2207.04672.
[21] I. Shutterstock, List of dirty, naughty, obscene, and otherwise bad words,
https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words, 2020. Accessed: 2023-12-12.
[22] R. J. Gabriel, English full list of bad words and top swear words banned by google,
https://github.com/cofee-and-fun/google-profanity-words/blob/main/data/en.txt, 2023. Accessed:
2023-12-12.
[23] K. Bobrovnyk, The dictionary of ukrainian obscene words,
https://github.com/saganoren/obsceneukr, 2019. Accessed: 2023-12-12.
[24] A. Jiang, X. Yang, Y. Liu, A. Zubiaga, SWSR: A chinese dataset and lexicon for online sexism
detection, Online Soc. Networks Media 27 (2022) 100182. URL: https://doi.org/10.1016/j.osnem.
2021.100182. doi:10.1016/J.OSNEM.2021.100182.
[25] J. Lu, B. Xu, X. Zhang, C. Min, L. Yang, H. Lin, Facilitating fine-grained detection of Chinese
toxic language: Hierarchical taxonomy, resources, and benchmarks, in: A. Rogers, J. Boyd-Graber,
N. Okazaki (Eds.), Proceedings of the 61st Annual Meeting of the Association for Computational
Linguistics, 2023, pp. 16235–16250. URL: https://aclanthology.org/2023.acl-long.898.
[26] Meta Research, Toxicity-200, 2023. URL: https://github.com/facebookresearch/flores/blob/main/
toxicity/README.md, accessed July 2025.
[27] K. Hashimoto, 概要, https://github.com/MosasoM/inappropriate-words-ja, 2020.
[28] C. Liebeskind, M. Litvak, N. Vanetik, From linguistics to practice: a case study of offensive
language taxonomy in Hebrew, in: Y.-L. Chung, Z. Talat, D. Nozza, F. M. Plaza-del Arco, P. Röttger,
A. Mostafazadeh Davani, A. Calabrese (Eds.), Proceedings of the 8th Workshop on Online Abuse
and Harms (WOAH 2024), Association for Computational Linguistics, Mexico City, Mexico, 2024,
pp. 110–117. URL: https://aclanthology.org/2024.woah-1.8/. doi:10.18653/v1/2024.woah-1.8.
[29] Wiktionary contributors, Catégorie: Insultes en français, 2021. URL: https://fr.wiktionary.org/wiki/
Cat%C3%A9gorie:Insultes_en_fran%C3%A7ais, accessed July 2025.
[30] Wiktionary contributors, Catégorie: Termes vulgaires en français, 2021. URL: https://fr.wiktionary.
org/wiki/Cat%C3%A9gorie:Termes_vulgaires_en_fran%C3%A7ais, accessed July 2025.
[31] P. Mathur, R. Sawhney, M. Ayyar, R. R. Shah, Did you offend me? classification of offensive tweets
in hinglish language, in: D. Fišer, R. Huang, V. Prabhakaran, R. Voigt, Z. Waseem, J. Wernimont
(Eds.), Proceedings of the 2nd Workshop on Abusive Language Online, ALW@EMNLP 2018,
Brussels, Belgium, October 31, 2018, Association for Computational Linguistics, 2018, pp. 138–148.
URL: https://doi.org/10.18653/v1/w18-5118. doi:10.18653/V1/W18-5118.
[32] E. Rykov, K. Zaytsev, I. Anisimov, A. Voronin, Smurfcat at PAN 2024 textdetox: Alignment
of multilingual transformers for text detoxification, in: G. Faggioli, N. Ferro, P. Galuščáková,
A. G. S. de Herrera (Eds.), Working Notes of the Conference and Labs of the Evaluation Forum
(CLEF 2024), Grenoble, France, 9-12 September, 2024, volume 3740 of CEUR Workshop Proceedings,
CEUR-WS.org, 2024, pp. 2866–2871. URL: https://ceur-ws.org/Vol-3740/paper-276.pdf.
[33] A. Conneau, K. Khandelwal, N. Goyal, V. Chaudhary, G. Wenzek, F. Guzmán, E. Grave, M. Ott,
L. Zettlemoyer, V. Stoyanov, Unsupervised cross-lingual representation learning at scale, in:
D. Jurafsky, J. Chai, N. Schluter, J. R. Tetreault (Eds.), Proceedings of the 58th Annual Meeting of
the Association for Computational Linguistics, ACL 2020, Online, July 5-10, 2020, Association for
Computational Linguistics, 2020, pp. 8440–8451. URL: https://doi.org/10.18653/v1/2020.acl-main.
747. doi:10.18653/V1/2020.ACL-MAIN.747.
[34] F. Feng, Y. Yang, D. Cer, N. Arivazhagan, W. Wang, Language-agnostic BERT sentence embedding,
in: S. Muresan, P. Nakov, A. Villavicencio (Eds.), Proceedings of the 60th Annual Meeting of the
Association for Computational Linguistics (Volume 1: Long Papers), ACL 2022, Dublin, Ireland,
May 22-27, 2022, Association for Computational Linguistics, 2022, pp. 878–891. URL: https://doi.
org/10.18653/v1/2022.acl-long.62. doi:10.18653/V1/2022.ACL-LONG.62.
[35] M. Popovic, chrf: character n-gram f-score for automatic MT evaluation, in: Proceedings of
the Tenth Workshop on Statistical Machine Translation, WMT@EMNLP 2015, 17-18 September
2015, Lisbon, Portugal, The Association for Computer Linguistics, 2015, pp. 392–395. URL: https:
//doi.org/10.18653/v1/w15-3049. doi:10.18653/V1/W15-3049.
[36] N. M. Guerreiro, R. Rei, D. van Stigt, L. Coheur, P. Colombo, A. Martins, xcomet: Transparent
machine translation evaluation through fine-grained error detection, Transactions of the Association
for Computational Linguistics 12 (2023) 979–995. URL: https://api.semanticscholar.org/CorpusID:
264146484.
[37] D. Larionov, M. Seleznyov, V. Viskov, A. Panchenko, S. Eger, xCOMET-lite: Bridging the gap
between efficiency and quality in learned MT evaluation metrics, in: Y. Al-Onaizan, M. Bansal,
Y.-N. Chen (Eds.), Proceedings of the 2024 Conference on Empirical Methods in Natural Language
Processing, Association for Computational Linguistics, Miami, Florida, USA, 2024, pp. 21934–21949.
URL: https://aclanthology.org/2024.emnlp-main.1223.
[38] M. Cao, A. Lam, H. Duan, H. Liu, S. Zhang, K. Chen, Compassjudger-1: All-in-one judge model
helps model evaluation and evolution, CoRR abs/2410.16256 (2024). URL: https://doi.org/10.48550/
arXiv.2410.16256. doi:10.48550/ARXIV.2410.16256. arXiv:2410.16256.
[39] DeepSeek-AI, Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning,
2025. URL: https://arxiv.org/abs/2501.12948. arXiv:2501.12948.
[40] Q. Team, Qwen3 technical report, 2025. URL: https://arxiv.org/abs/2505.09388. arXiv:2505.09388.
[41] E. J. Hu, Y. Shen, P. Wallis, Z. Allen-Zhu, Y. Li, S. Wang, L. Wang, W. Chen, Lora:
Low-rank adaptation of large language models, in: The Tenth International Conference on
Learning Representations, ICLR 2022, Virtual Event, April 25-29, 2022, OpenReview.net, 2022. URL:
https://openreview.net/forum?id=nZeVKeeFYf9.
[42] D. Dementieva, V. Logacheva, I. Nikishina, A. Fenogenova, D. Dale, I. Krotova, N. Semenov, T.
Shavrina, A. Panchenko, RUSSE-2022: Findings of the First Russian Detoxification Shared Task Based
on Parallel Corpora, COMPUTATIONAL LINGUISTICS AND INTELLECTUAL TECHNOLOGIES
(2022). URL: https://api.semanticscholar.org/CorpusID:253169495.
[43] D. Dementieva, N. Babakov, A. Panchenko, MultiParaDetox: Extending text detoxification with
parallel data to new languages, in: K. Duh, H. Gomez, S. Bethard (Eds.), Proceedings of the 2024
Conference of the North American Chapter of the Association for Computational Linguistics:
Human Language Technologies (Volume 2: Short Papers), Association for Computational Linguistics,
Mexico City, Mexico, 2024, pp. 124–140. URL: https://aclanthology.org/2024.naacl-short.12.
[44] D. Xianbing, H. Zhongyuan, P. Jiangao, S. Kaiyin, Multilingual Text Detoxification System Based
on Parallel Architecture: An Intelligent Approach Integrating Local Models and Large Language
Models, in: Working Notes of CLEF 2025 - Conference and Labs of the Evaluation Forum,
CEUR-WS.org, 2025.
[45] J. Peng, S. Kaiyin, L. Kaichuan, L. Zhankeng, H. Zhongyuan, A Multilingual Text Detoxification
Method Based on Chain-of-Thoughts Prompting Approach, in: Working Notes of CLEF 2025
- Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[46] P. Shah, V. Shah, S. Kale, Multilingual Text Detoxification via Prompted MT0-XL and Lexical
Filtering, in: Working Notes of CLEF 2025 - Conference and Labs of the Evaluation Forum,
CEUR-WS.org, 2025.
[47] T. D. A. Dang, F. P. D’Elia, GemDetox: Enhancing a massively multilingual model for text
detoxification on low-resource languages, in: Working Notes of CLEF 2025 - Conference and Labs
of the Evaluation Forum, CEUR-WS.org, 2025.
[48] A. Voronin, D. Moskovsky, N. Sushko, PAN 2025 Textdetox: Exploring a Sage-T5-like approach
for text detoxification, in: Working Notes of CLEF 2025 - Conference and Labs of the Evaluation
Forum, CEUR-WS.org, 2025.
[49] H. Farid, Z. Ahmad, A. Mahmood, I. Ameer, HF_Detox at PAN 2025 TextDetox: Prompt-Driven
Multilingual Detoxification, in: Working Notes of CLEF 2025 - Conference and Labs of the
Evaluation Forum, CEUR-WS.org, 2025.
[50] X. Liu, Y. Yi, Z. Chen, S. Xu, Z. Ke, X. Guo, Y. Huang, W. Zhang, J. Chen, Y. Han, Jiaozipi at
CLEF 2025: A Multilingual Text Detoxification Method Based on Large Language Model-Based
Ensemble Learning, in: Working Notes of CLEF 2025 - Conference and Labs of the Evaluation
Forum, CEUR-WS.org, 2025.
[51] V. Kozlovskiy, A. Ploskin, S. Tantry, T. Matveeva, S. Savelyeva, Can Small Models Outperform
Large Ones in Text Detoxification?, in: Working Notes of CLEF 2025 - Conference and Labs of the
Evaluation Forum, CEUR-WS.org, 2025.
[52] S. Bourbour, A. S. Kelishami, M. Gheysari, F. Rahimzadeh, Cross-Lingual Detoxification with
Few-Chain Prompting: A Competitive System for TextDetox 2025, in: Working Notes of CLEF
2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org, 2025.
[53] N. Krishna, L. Sai Teja, A. Mishra, Team Detox at PAN: Multilingual Text Detoxification using LLM,
in: Working Notes of CLEF 2025 - Conference and Labs of the Evaluation Forum, CEUR-WS.org,
2025.
[54] N. Lai-Lopez, S. Yuan, L. Wang, L. Zhang, Lexicon-Guided Detoxification and Classifier-Gated
Rewriting: A PAN 2025 Submission, in: Working Notes of CLEF 2025 - Conference and Labs of the
Evaluation Forum, CEUR-WS.org, 2025.</p>
    </sec>
    <sec id="sec-12">
      <title>A. Automatic and LLM-as-a-Judge Full Evaluation Results</title>
      <p>Here, we provide the extended results—from both automatic and LLM evaluation setups—based on
three evaluation parameters for all languages: Amharic (Table 9), Arabic (Table 10), German (Table 11),
English (Table 12), Spanish (Table 13), Hindi (Table 14), Russian (Table 15), Ukrainian (Table 16), Chinese
(Table 17), French (Table 18), Hebrew (Table 19), Hinglish (Table 20), Italian (Table 21), Japanese (Table 22),
and Tatar (Table 23). In every table, the baselines are highlighted with gray; Human References are
highlighted with green; the ordering is by J score from the LLM-as-a-Judge Evaluation results.
The automatic evaluation is based on the full test set of 600 samples per language; LLM evaluation was
performed on a 100-sample subset of the test set per language.</p>
      <p>J</p>
      <p>FL</p>
      <p>FL</p>
      <p>J</p>
      <p>J</p>
      <p>FL</p>
      <p>FL</p>
      <p>J</p>
      <p>FL</p>
      <p>FL</p>
      <p>J</p>
      <p>FL</p>
      <p>FL</p>
      <p>J</p>
      <p>FL</p>
      <p>FL</p>
      <p>J</p>
      <p>LLM evaluation</p>
      <p>SIM STA</p>
      <p>FL</p>
      <p>FL</p>
    </sec>
    <sec id="sec-13">
      <title>B. Prompts used for LLM as a judge</title>
      <p>Content similarity prompt
You are given with two texts. Your task is to compare the meaning of them and decide whether
they mean the same or not.</p>
      <sec id="sec-13-1">
        <title>Options: - yes (both texts have the same meaning) - no (there is significant difference in meaning between the two texts)</title>
      </sec>
      <sec id="sec-13-2">
        <title>Examples:</title>
        <p>text1: "Why do you behave like an asshole?"
text2: "Why do you act so bad?"
Answer: yes
text1: "This political party is a gang of idiots"
text2: "They just dont know their stuff"
Answer: no
Style transfer evaluation prompt
You are given with two texts. Your task is to compare which text is more toxic.
Options:
- text1 (text1 is more toxic than text2)
- text2 (text2 is more toxic than text1)
- None (both texts have equal toxicity - both of them are either toxic or non-toxic at the same
time)</p>
      </sec>
    </sec>
  </body>
  <back>
    <ref-list>
      <ref id="ref1">
        <mixed-citation>
          [1]
          <string-name>
            <given-names>Z. R.</given-names>
            <surname>Shi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Fang</surname>
          </string-name>
          ,
          <article-title>Artificial intelligence for social good: A survey</article-title>
          , CoRR abs/
          <year>2001</year>
          .
          <year>01818</year>
          (
          <year>2020</year>
          ). URL: http://arxiv.org/abs/
          <year>2001</year>
          .
          <year>01818</year>
          . arXiv:
          <year>2001</year>
          .
          <year>01818</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref2">
        <mixed-citation>
          [2]
          <string-name>
            <given-names>N.</given-names>
            <surname>Rizwan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S. M.</given-names>
            <surname>Yimam</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Dementieva</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Skupin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Fischer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Moskovskiy</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A. A.</given-names>
            <surname>Borkar</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Geislinger</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Saha</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Roy</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Semmann</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Panchenko</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Biemann</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Mukherjee</surname>
          </string-name>
          , HATEPRISM:
        </mixed-citation>
      </ref>
    </ref-list>
  </back>
</article>