<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Penta-nlp at EXIST 2024 Task 1-3: Sexism Identification, Source Intention, Sexism Categorization In Tweets Notebook for the EXIST Lab at CLEF 2024</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Fariha</forename><forename type="middle">Tanjim</forename><surname>Shifat</surname></persName>
							<email>fariha.tanjim.shifat@gmail.com</email>
							<affiliation key="aff0">
								<orgName type="department">Research and Development</orgName>
								<orgName type="institution">Penta Global Limited</orgName>
								<address>
									<country key="BD">Bangladesh</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Fabiha</forename><surname>Haider</surname></persName>
							<email>fabihahaider4@gmail.com</email>
							<affiliation key="aff0">
								<orgName type="department">Research and Development</orgName>
								<orgName type="institution">Penta Global Limited</orgName>
								<address>
									<country key="BD">Bangladesh</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><roleName>Md</roleName><forename type="first">Sakib</forename><surname>Ul</surname></persName>
						</author>
						<author>
							<persName><forename type="first">Rahman</forename><surname>Sourove</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Research and Development</orgName>
								<orgName type="institution">Penta Global Limited</orgName>
								<address>
									<country key="BD">Bangladesh</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Deeparghya</forename><surname>Dutta Barua</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Research and Development</orgName>
								<orgName type="institution">Penta Global Limited</orgName>
								<address>
									<country key="BD">Bangladesh</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Md</forename><surname>Farhan</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Research and Development</orgName>
								<orgName type="institution">Penta Global Limited</orgName>
								<address>
									<country key="BD">Bangladesh</country>
								</address>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="institution">Islamic University of Technology</orgName>
								<address>
									<country key="BD">Bangladesh</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Md</forename><surname>Fahim</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Research and Development</orgName>
								<orgName type="institution">Penta Global Limited</orgName>
								<address>
									<country key="BD">Bangladesh</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="laboratory">CCDS Lab</orgName>
								<orgName type="institution">IUB</orgName>
								<address>
									<country key="BD">Bangladesh</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Farhad</forename><surname>Alam Bhuiyan</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Research and Development</orgName>
								<orgName type="institution">Penta Global Limited</orgName>
								<address>
									<country key="BD">Bangladesh</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">Penta-nlp at EXIST 2024 Task 1-3: Sexism Identification, Source Intention, Sexism Categorization In Tweets Notebook for the EXIST Lab at CLEF 2024</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">8524F667FBF2DEB8E99C55906BE5C9CE</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T17:59+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Sexism identification</term>
					<term>Tweets</term>
					<term>Source intention detection</term>
					<term>Sexism categorization</term>
					<term>Multilingual Models</term>
					<term>Natural Language Processing</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>Social media platforms contain a vast user base and offer ease with which information can be shared. This can adversely facilitate the spread of sexist content which is infeasible for human monitoring and filtering. This paper investigates the automated detection of sexism in tweets using Natural Language Processing (NLP) techniques. Sexist tweets can create a hostile online environment and perpetuate harmful stereotypes. Manual identification is impractical due to the vast amount of data. The research proposes a system utilizing machine learning models to analyze text, identify bias and discriminatory language patterns, and flag tweets for moderation. The fourth edition of EXIST shared task 2024 Tweets Dataset, containing labeled English and Spanish tweets, is used to train and evaluate the models. The system explores various approaches, including TF-IDF with different classifiers (SVM, XGB, RF), Long Short-Term Memory (LSTM) networks with and without attention mechanisms, and pre-trained transformer models (XLM-Roberta, mBERT, BETO). The effectiveness of different preprocessing techniques and the role of attention weights in identifying sexism are also explored. The paper outlines the methodology, experimental setup, and analysis of results, paving the way for further discussion on error analysis and conclusions in subsequent sections.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>This research paper discusses topics related to specific content that may be sensitive or offensive, which some readers may find distressing. The intent is to analyze and understand the research work.</p><p>The commencement and growth of the internet have had a profound impact on our social structure, the way we communicate, and our relationships through the development of various social platforms. Twitter, one such social platform, has developed into a lively forum for sharing ideas and discourse because of its succinct format and emphasis on real-time updates, with attractive features like hashtags, tags, etc. While such advancements of social platforms promote connectivity and facilitate the spread of information, they also tempt people to gain fame through views and likes, and throw inappropriate contents and comments in the guise of freedom of speech, lacking empathy towards race, gender, and religion <ref type="bibr" target="#b0">[1]</ref>. Evidently, sexism exists on Twitter in the form of sexist tweets, sometimes intentionally while at other times unintentionally. These tweets and contents can range from blatant objectification and insults to more implicit bias and prejudice. Sexist tweets can create a hostile environment online, especially for women and other targeted groups. Identifying sexism in online platforms is crucial for a variety of reasons. Recognizing sexist content is essential to advancing equality and averting social harm. The propagation of detrimental stereotypes and biases through such information contributes to gender inequality and has a detrimental effect on people's mental health and self-esteem <ref type="bibr" target="#b1">[2]</ref>. 
We can promote an inclusive and respectful culture by addressing sexist content, making sure that everyone feels respected and secure.</p><p>The sheer volume of such sexist tweets makes it difficult to identify sexism manually, requiring the immediate need for automated solutions. Natural Language Processing (NLP) can emerge as a successful tool in recognizing such harmful contents and filtering them out <ref type="bibr" target="#b2">[3]</ref>. It can prevent people from posting content that goes against community standards, creating a safer and more acceptable platform. Using natural language processing (NLP) techniques, we can create automated systems that can recognize sexist tweets with accuracy. Text content can be analyzed by these technologies, which can also spot bias and discriminatory language patterns and flag tweets for moderation or additional review.</p><p>EXIST aims to capture sexism in a broad sense, from explicit misogyny to other subtle expressions that involve implicit sexist behaviours. EXIST is a series of scientific events and shared tasks on sexism identification in social networks <ref type="bibr" target="#b3">[4]</ref>, <ref type="bibr" target="#b4">[5]</ref>. The EXIST 2024 Tweets Dataset contains more than 10,000 labeled tweets, both in English and Spanish. Based on the labeled data, the tasks were to identify sexist tweets among them, the underlying intention of the author, and if it contained sexism at multiple degrees. The challenge lies in the nuanced and context-dependent nature of language on social media. Tweets often use slang, sarcasm, or coded language that can obscure sexist intent, while URLs can lead to external content that may contain sexist material not immediately evident in the tweet itself. Mentions and hashtags can complicate analysis by linking tweets to broader conversations or by being used to target specific individuals. 
Emojis add another layer of complexity, as their meanings can vary widely depending on context and cultural interpretation. These factors make automated detection systems prone to errors, requiring sophisticated algorithms and often human oversight to accurately identify and address sexism in tweets.</p><p>Our approach involved training different Machine Learning (ML) models to capture the sexist pattern in the contents and choose the one that performs the best on the validation dataset. The machine learning models included TF-IDF+SVM <ref type="bibr" target="#b5">[6]</ref>, TF-IDF+XGB <ref type="bibr" target="#b6">[7]</ref>, TF-IDF+RF <ref type="bibr" target="#b7">[8]</ref>, LSTM <ref type="bibr" target="#b8">[9]</ref>, LSTM+Attention <ref type="bibr" target="#b9">[10]</ref>, XLM-Roberta <ref type="bibr" target="#b10">[11]</ref>, mBert <ref type="bibr" target="#b11">[12]</ref>, BETO <ref type="bibr" target="#b12">[13]</ref>. We yielded results for different ways of preprocessing the data and further finetuned the models based on best results. Additionally, we attempted to find out the provoking words that contributed to the semantic meaning of sexism reflected through the attention weights of those words. Taking into account the attention weights of the sentence representation helped the same model to yield the best result for Task 1, Task 2 and Task 3. The results show that the BERT-based models are well trained to capture the pattern of sexism when considering the attention layer. The performance metrics used were accuracy and F1 score and we could obtain an accuracy of 84.80%, 72.06% and 88.25% and F1 score of 84.76%, 51.43% and 54.77% for Task 1, Task 2 and Task 3, respectively. To do better modeling and include different explainability results, we adopt different training and explainable experiments from EDAL, ITPT, HateXplain <ref type="bibr" target="#b13">[14,</ref><ref type="bibr" target="#b14">15,</ref><ref type="bibr" target="#b15">16]</ref> papers. 
In the remainder of this paper, we present the Problem Description in Section 2, Methodology in Section 3, Experimental Setup in Section 4, Result Analysis in Section 5, Error Analysis in Section 6 and Conclusion in Section 7.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Problem Description</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Task Descriptions</head><p>In this work, we have addressed tasks 1, 2, and 3. The task descriptions are listed as per the guidelines.</p><p>1. Task 1 (Sexism Identification in Tweets): Given a tweet, this subtask aimed to classify whether a tweet contains sexist expressions or behaviors. The tweet can be sexist itself, describe a sexist situation or critic sexist behavior. It is a binary classification problem. We needed to label 'YES' or 'NO' to the tweets.</p><p>2. Task 2 (Source Intention in Tweets): This subtask aims to classify each tweet according to the intention of the person who wrote it. This is a multi-class classification problem. The classes are as follows:</p><p>• DIRECT: The intention is to write a message that is sexist itself.</p><p>• REPORTED: The author intends to report or describe a sexist situation or event suffered by a woman or women in the first or third person. • JUDGEMENTAL: The author intends to be judgemental since the tweet describes sexist situations or behaviors to condemn them. • NO: The tweet is detected as not sexist in subtask 1.</p><p>3. Task 3: This subtask categorizes the tweets according to the type of sexism. This is a multi-label classification problem with 5 labels. So more than one class can be assigned to each tweet. The labels are as follows:</p><p>• IDEOLOGICAL-INEQUALITY: The tweet discredits the feminist movement, rejects inequality between men and women, or presents men as victims of gender-based oppression. • STEREOTYPING-DOMINANCE: The tweet expresses false ideas about women that suggest they are more suitable to fulfill certain roles (mother, wife, family caregiver, faithful, tender, loving, submissive, etc.), or inappropriate for certain tasks (driving, hard work, etc.), or claims that men are somehow superior to women. 
• OBJECTIFICATION: The tweet presents women as objects apart from their dignity and personal aspects or assumes or describes certain physical qualities that women must have to fulfill traditional gender roles (compliance with beauty standards, hypersexualization of female attributes, women's bodies at the disposal of men, etc.). • SEXUAL-VIOLENCE: The tweet includes or describes sexual suggestions, requests for sexual favors, or harassment of a sexual nature (rape or sexual assault). • MISOGYNY-NON-SEXUAL-VIOLENCE: The tweet expresses hatred and violence towards women, different from that with sexual connotations. • NO: When none of the 5 labels are assigned to the tweet.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Dataset Statistics</head><p>The dataset includes over 10,000 tweets both in Spanish and English. The train, dev, and test sets contain 6064, 934, and 2076 tweets respectively. These numbers are after discarding the non-labeled samples in the gold standard. The distribution between both languages has been somewhat balanced to tackle the issue of biases. The exact figures are in Table <ref type="table" target="#tab_0">1</ref>. Task 1 is a binary classification problem with 'YES' and 'NO' labels. Task 2 is a multi-class classification problem with 3 sexism sources and one 'NO' label. Task 3 is a multi-label classification problem with 5 different labels and a 'NO' label when none of the labels are assigned. The distribution is not balanced. Each tweet is labeled by six different annotators. The gold label is the average of the labels. The exact figures of the distribution are in Table <ref type="table" target="#tab_1">2</ref>. They are then passed to the model which gives the encoded format of those tokens. Encoded tokens except CLS are passed to the attention module whereas CLS is passed to the MLP layer which then is aggregated using various methods and then this representation is used for classification. The logits are yielded by the classifier.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Methodology</head><p>In this section, we will outline our methodology. Given the multilingual nature of the dataset encompassing both English and Spanish texts, we employ a multilingual pretrained model. We further refine this model through fine-tuning on the dataset. Our model architecture comprises five key components: i) Pretrained model backbone, ii) Sentence Representation from CLS token, iii) Attention-based Context Vector, iv) Feature Aggregation Module, and v) Classifier Head</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Pretrained Multilingual Model as Backbone</head><p>An input sentence 𝑆 is passed into the pretrained multilingual tokenizer to obtain the tokens of the sentence 𝑆 = {𝑡 [𝐶𝐿𝑆] , 𝑡 </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Sentence Representation from CLS Token</head><p>To get the representation for the entire sentence, we use the representation of [CLS] token ℎ [𝐶𝐿𝑆] . This representation is passed into a Single Layer Perceptron to get the enhanced representation.</p><formula xml:id="formula_0">ℎ ′ 𝐶𝐿𝑆 = 𝑊 𝐶𝐿𝑆 • ℎ CLS + 𝑏 𝐶𝐿𝑆</formula></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3.">Attention-based Context Vector</head><p>As the tasks are natural language understanding tasks where individual words hold distinct predictive significance, we integrate an attention-based network to ascertain word importance. By accounting for the significance of each word, we construct a context vector for the sentence. We only consider the representations of the words and exclude the representations of the special tokens.</p><p>Once contextual representations 𝐻 are obtained for a sentence 𝑆, an additional attention layer is added to compute learnable attention scores 𝛼 𝑖 for each token 𝑡 𝑖 in 𝐻, and its calculation is as follows:</p><formula xml:id="formula_1">𝛼 𝑖 = softmax(𝑊 • ℎ 𝑖 + 𝑏), 𝑖 = 1, 2, . . . , 𝑛</formula><p>This results in a set of attention_scores = {𝛼 1 , 𝛼 2 , . . . , 𝛼 𝑛 } corresponding to the tokens in sentence 𝑆. These attention scores collectively represent the overall attention distribution across the sentence, indicating the relative importance or relevance of each token to the context of the entire sentence. After finding attention scores for each token, we find the context vector for the sentence 𝑆 by multiplying the contextual representations of token 𝑡 𝑖 with its attention score 𝛼 𝑖 .</p><formula xml:id="formula_2">𝑐 = 𝑛 ∑︁ 𝑖=1 𝛼 𝑖 • ℎ 𝑖</formula></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.4.">Feature Aggregation Module</head><p>In this section, we consolidate various representations of the sentence 𝑆. We obtain two distinct representations: one based on the CLS token ℎ </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.5.">Classifier Head</head><p>After finding the aggregated feature representation 𝑧, it is fed into a classification layer. The representation 𝑧 is employed for the classification process to produce the logits 𝑧 ′ by the following:</p><formula xml:id="formula_3">𝑧 ′ = 𝑊 • 𝑧 + 𝑏</formula><p>Finally, we calculate the Cross-Entropy (CE) loss based on 𝑧 ′ with the ground truth.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Experimental Setup</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.">Model Selection</head><p>For the tasks, our initial approach involved a thorough exploration of various model types to determine the most appropriate one. Our investigation led us to examine three distinct categories: Machine Learning (ML) Models, Deep Learning Models, and Transformer-based Pretrained Models. For machine learning models, we considered Support Vector Machine (SVM), Random Forest (RF), and XGBoost.</p><p>Employing TF-IDF as our feature extractor, we processed each sentence through this mechanism before inputting it into the ML models for classification. For task 3, we consider Logistic Regression instead of XGBoost.</p><p>In our deep learning methodologies, we leverage both LSTM <ref type="bibr" target="#b16">[17]</ref> and LSTM + Attention models. As for transformer-based approaches, we've explored XLM-RoBERTa <ref type="bibr" target="#b17">[18]</ref>, mBERT <ref type="bibr" target="#b18">[19]</ref>, and BETO <ref type="bibr" target="#b19">[20]</ref>. The outcomes are detailed in Table <ref type="table" target="#tab_3">3</ref> for the development set. These model selections were driven by the presence of two distinct languages within the dataset. The total unique words for TF-IDF experiments were 39613 after deleting the punctuations, auxiliaries, and spaces. For LSTM experiments, we have used embedding dim = 50, hidden units = 64 of the LSTM layer, and output dim = 256 of a fully connected layer with a learning rate of 0.001. The experiments were done in 20 epochs. For the transformer-based models, we fine-tuned the pre-trained models. We use 10 epochs with a batch size of 32. Table <ref type="table" target="#tab_3">3</ref> presents the results of different models for Task 1 and Task 2 datasets, while Table <ref type="table" target="#tab_5">4</ref> displays the model performance specifically for Task 3 on the development set. 
Analysis of Table <ref type="table" target="#tab_3">3</ref> reveals that classical ML models exhibit competitiveness against deep learning counterparts. Notably, employing TF-IDF with XGBoost yields superior results to deep learning models. Incorporating the attention module leads to a notable 3% enhancement in accuracy for Task 1 and a 1% improvement in Task 2. Additionally, fine-tuning pretrained models demonstrates substantial improvements ranging from 6-9%. Among these, XLM-Roberta demonstrates optimal performance for both Task 1 and Task 2, thereby being selected as the final model for further experimentation. Turning to Task 3, as indicated in Table <ref type="table" target="#tab_5">4</ref>, mBERT marginally outperforms XLM-RoBERTa. Consequently, mBERT is chosen as the final model for Task 3.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.">Preprocessing</head><p>The datasets retrieved from Twitter contain additional information such as usernames and URLs alongside the original posts and comments. To gauge their impact, we conducted experiments using various preprocessing methods. These techniques included removing usernames, URLs, punctuation, and emojis. The outcomes of these experiments are presented in Table <ref type="table" target="#tab_6">5</ref>. From the table, we can see that removing URLs from the tweets improves the model performance for task 1 and task 3. For task 2, no preprocessing is helpful. Based on those results, we set a learning rate of 2e-5, random seed = 42 and batch size = 32 for tasks 1 &amp; 3 and 16 for task 3 for our model. We use the AdamW optimizer in our experiments with betas = (0.9, 0.99). All experiments were conducted using Python (version 3.12) and PyTorch, leveraging the free NVIDIA Tesla P100 GPU provided by Kaggle. For the transformer based models we consider the Huggingface transformers library. All the transformer based models were run for 10 epochs.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.4.">Evaluation Metrics</head><p>When assessing the effectiveness of the models, we consider different performance metrics. Mainly, we focus on Accuracy, Macro-F1, and ICM as our evaluation metrics.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.4.1.">Accuracy</head><p>Accuracy measures the proportion of correctly classified instances among all instances. It is calculated by dividing the sum of true positives (correctly predicted positive instances) and true negatives (correctly predicted negative instances) by the total number of instances.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>𝐴𝑐𝑐𝑢𝑟𝑎𝑐𝑦 =</head><p>𝑇 𝑃 + 𝑇 𝑁 𝑇 𝑃 + 𝑇 𝑁 + 𝐹 𝑃 + 𝐹 𝑁</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.4.2.">Macro F1 Score</head><p>The F1 score is the harmonic mean of precision and recall. In the macro F1 score, each class is given equal weight, and the mean of these F1 scores across all classes is calculated. Macro F1 Score is calculated as follows:</p><formula xml:id="formula_4">F1-Score 𝑖 = 2 1 𝑃 𝑟𝑒𝑐𝑖𝑠𝑖𝑜𝑛 𝑖 + 1 𝑅𝑒𝑐𝑎𝑙𝑙 𝑖 Macro-F1 = 1 𝑁 𝑁 ∑︁ 𝑖=1 F1-Score 𝑖</formula></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.4.3.">ICM</head><p>The ICM metric functions as a similarity measure employed to assess the likeness between model-generated outputs and the actual ground truth in classification tasks. It does so by comparing the Information Content of categories presented through its features with the help of a Similarity function.</p><p>It extends the principles of Pointwise Mutual Information, a common metric used for evaluating relationships between words. When all parameters in the ICM formula are set to 1, it becomes equivalent to PMI. This means ICM can capture similar relationships as PMI but with more flexibility. ICM helps evaluate how well a system's output aligns with the ground truth by comparing the information content of the categories they represent. The Information Contrast Measure (ICM) is adopted for all tasks and evaluation types (hard-hard, hard-soft, soft-soft). ICM-soft is an extension of ICM for evaluating hierarchical multi-label classification tasks where there might be disagreements in the ground truth. It can handle both soft system outputs and soft ground truth.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Result Analysis</head><p>Table <ref type="table" target="#tab_10">9</ref> exhibits the results of various sentence representation extractors employing different aggregation techniques across three tasks on the validation dataset. Regarding task 1, employing the CLS token-based representation yields an accuracy of 84.90% and an F1 score of 83.90%. Transitioning to context vector representation through an attention feature extractor results in a slight increase in the F1 score. Furthermore, combining both types of representation through addition-based aggregation shows a 2% enhancement in performance across the board. If we combine them using concat based aggregation techniques we see a small improvement in F1 score but not like the addition based aggregation one. For task 2, we observe a similar trend in the performance where attention based sentence representation performs better than the CLS token based sentence extractor. If we combine both CLS and the context vector obtained from attention through addition based aggregation techniques, we see a 1% improvement in accuracy and a 2% improvement in F1 score. While we consider concat based aggregation techniques for combining both sentence level representations, the performance is further improved by 1.2% in accuracy and around 3% improvement in F1 score. For task 3, we also get a better result than the CLS based baseline while we aggregate both sentence representations using concatenation. The top-performing model is chosen to generate predictions for the test dataset across all three tasks. The performance results on the test dataset are detailed in Table <ref type="table" target="#tab_11">10</ref>, showcasing the ICM-Hard, ICM-Hard Norm, and Macro-F1 scores as our performance metrics. Our best-performing model achieves an F1 score of 75.01 for all tweets, with 72.09 and 77.33 F1 scores recorded for English and Spanish tweets, respectively. 
For task 2, the score is 48.56, while task 3 reaches 43.79 across all tweet data. The table shows that our model can predict Spanish tweets more effectively than English tweets.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">Error Analysis</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.1.">Confusion Matrix</head><p>The evaluation of our approach for Task 1 is done on the development dataset. The confusion matrix shown in 2(a) represents the performance of the classification. The number of True Positive is the number of correctly predicted positive cases. The model correctly predicted 388 cases as "Yes" (True Yes). True Negative cases is the number of correctly predicted negative cases. The model correctly predicted 418 cases as "NO" (True No). False Positive is the number of incorrectly predicted positive cases. The model incorrectly predicted 61 cases as "YES" when they were actually "NO" (False YES). False Negative (FN) is the number of incorrectly predicted negative cases. The model incorrectly predicted 67 cases as "NO" when they were actually "YES" (False NO). The model produced 806 correct classification as opposed to 128 misclassification.</p><p>Similarly, the confusion matrix is shown in 2(b) shows the result of the evaluation done on the development dataset for Task 2. As shown in the figure, out of 479 true NO labels, 420 cases were correctly predicted as NO, 150 were correctly predicted as DIRECT out of 204 true DIRECT cases, 36 were correctly predicted as REPORTED out of 65 true REPORTED cases and 18 were correctly predicted as JUDGEMENTAL out of 83 true JUDGEMENTAL cases. The model produced 624 correct classification as opposed to 217 misclassification.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.2.">Attention Heatmap</head><p>We have calculated the Layer Integrated Gradient attributions for a few specific input tweets using Captum <ref type="bibr" target="#b20">[21]</ref>. The attributions explain how each input element of a tweet contributes to the model's prediction for the target class. It reflects the attention provided by the model to each token. So we name it an attention heatmap. The darker the color in the cell of a token, the higher its attribution value and the higher its contribution to the target. We have considered both English and Spanish tweets for the experiment and for creating the heatmap. The corresponding labels are also listed in the Figure <ref type="figure" target="#fig_3">3</ref>. We can infer that for the 'NO' label the most attentive tokens among the 4 tweets are last, abuse, ada, in and for the 'YES' label the most attentive tokens among the 4 tweets are economy, s, a, LAS. This experiment was done considering the best performing model in task 1. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Lang Attention heatmap for tokens of a sample tweet</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.3.">Most Attentive Tokens</head><p>We also extracted the most attentive tokens for each category in the validation dataset for further analysis. We determine these tokens by extracting the highest average self-attention scores for each token across the entire category-wise validation dataset. Transformer-based models compute attention scores to gauge the relationship between 𝑤𝑜𝑟𝑑 𝑖 and 𝑤𝑜𝑟𝑑 𝑗 across various layers and heads. Considering a transformer-based model with 𝐿 layers and 𝐻 heads per layer, and a sequence length of 𝑛 for a given sentence 𝑆, the resulting attention matrix has dimensions of 𝐿 × 𝐻 × 𝑛 × 𝑛. To compute the average attention score for token 𝑖 within sentence 𝑆, we follow this calculation:</p><formula xml:id="formula_5">Avg_Attn_Score 𝑖 𝑆 = 1 𝐿 • 𝐻 • 𝑛 𝐿 ∑︁ 𝑙=1 𝐻 ∑︁ ℎ=1 𝑛 ∑︁ 𝑡=1,𝑡̸ =𝑖 Attention 𝑖,𝑡,ℎ,𝑙</formula><p>To find category-wise average attention score for token 𝑖 on the validation dataset, we take the average of Avg_Attn_Score 𝑖 𝑆 for those sentences where token 𝑖 appears in. After calculating the token attention scores, we arrange them in descending order and select the top K words. In Table <ref type="table" target="#tab_0">11</ref>, we present the category-wise most attentive tokens predicted by the top-performing model for both Task 1 and Task 2.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 11</head><p>List of most attentive tokens for tasks 1 and 2 for each class with the best performing configuration for each task. The table presents a detailed breakdown of the most attentive tokens categorized by the top-performing model across two distinct tasks. In Task 1, where the classification pertains to identifying sexist content, the model identifies tokens such as "volant," "slut," and "cum" as prominent indicators for non-sexist content, while tokens like "penis, " "forever, " and "gangbang" are highlighted for identifying sexist content. Task 2, which involves various categories like "NO," "DIRECT," "REPORTED," and "JUDGEMENTAL, " exhibits a diverse range of most attentive tokens. For instance, in the "NO" category, tokens like "pun," "wall," and "question" stand out, while "DIRECT" category tokens include "where," "GPS, " and "tomorrow. " Additionally, tokens such as "wenr, " "school, " and "saben" are emphasized in the "REPORTED" and "JUDGEMENTAL" categories, respectively. This comprehensive analysis sheds light on the model's attention mechanism and its discernment of different types of content within each task.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Task</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="7.">Conclusion</head><p>The EXIST challenge is designed to promote research on automated sexism detection and modeling in online environments, with a particular focus on Twitter. In this paper, we performed extensive research and conducted thorough experiments to achieve this objective, employing advanced techniques in natural language understanding and machine learning. Specifically, we enhanced existing machine learning models by incorporating an attention layer and a CLS token, which emphasize the words that contribute to the context of sexism. Our findings demonstrate the effectiveness of our approach and models on both the training and validation datasets.</p><p>However, as the training is done on the Spanish and English languages, the model might not be proficient at identifying sexist content in other languages. This can lead to misclassification of sexist Tweets.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>′</head><label></label><figDesc>𝐶𝐿𝑆 and the other based on attention-based context vector 𝑐. The aggregation of these representations is performed using the two following techniques:• Concat-based Aggregation: For concat-based aggregation, we just simply concatenate ℎ ′ 𝐶𝐿𝑆 and 𝑐 to get the aggregated representation 𝑧 as follows: 𝑧_concat = concat[ℎ ′ 𝐶𝐿𝑆 , 𝑐] Then 𝑧_concat is passed into a single-layer MLP to get the final combined representation 𝑧 where 𝑧 = MLP(𝑧_concat) • Element wise Addition based Aggregation: In this aggregation, we combine ℎ ′ 𝐶𝐿𝑆 and 𝑐 by summing them element wise as follows 𝑧 𝑠𝑢𝑚𝑚𝑒𝑑 = ℎ ′ 𝐶𝐿𝑆 + 𝑐 Then 𝑧_summed is passed into a single-layer MLP to get the final combined representation 𝑧 where 𝑧 = MLP(𝑧_summed)</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Figure 2 :</head><label>2</label><figDesc>Figure 2: Confusion matrix with all classes for dev set with best performing configuration of each task.</figDesc><graphic coords="11,90.83,65.61,203.07,158.41" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 3 :</head><label>3</label><figDesc>Figure 3: Attention heatmap for task 1 of a few samples of the dev set with XLM-RoBERTa and the best performing configuration.</figDesc><graphic coords="11,100.64,383.84,343.93,184.93" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1</head><label>1</label><figDesc>Tweets Dataset, containing training, development and test splits. The dataset contained both English and Spanish language.</figDesc><table><row><cell cols="2">Data Splits Total Samples</cell><cell cols="2">Language Wise Samples EN ES</cell></row><row><cell>Train</cell><cell>6064</cell><cell>2870</cell><cell>3194</cell></row><row><cell>Dev</cell><cell>934</cell><cell>444</cell><cell>490</cell></row><row><cell>Test</cell><cell>2076</cell><cell>978</cell><cell>1098</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>Table 2</head><label>2</label><figDesc>Class or Label-wise distribution in the dataset for Task 1, Task 2 and Task 3.</figDesc><table><row><cell></cell><cell></cell><cell cols="2">Class/Labels</cell><cell></cell><cell></cell><cell></cell><cell></cell><cell cols="2">Samples Train Dev</cell><cell></cell></row><row><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell>Task 1</cell><cell></cell><cell></cell><cell></cell><cell></cell></row><row><cell></cell><cell></cell><cell>NO</cell><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell cols="2">3376 479</cell><cell></cell></row><row><cell></cell><cell></cell><cell>YES</cell><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell cols="2">2697 455</cell><cell></cell></row><row><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell>Task 2</cell><cell></cell><cell></cell><cell></cell><cell></cell></row><row><cell></cell><cell></cell><cell>NO</cell><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell cols="2">3367 479</cell><cell></cell></row><row><cell></cell><cell></cell><cell cols="2">DIRECT</cell><cell></cell><cell></cell><cell></cell><cell></cell><cell cols="2">1294 204</cell><cell></cell></row><row><cell></cell><cell></cell><cell cols="2">REPORTED</cell><cell></cell><cell></cell><cell></cell><cell></cell><cell>459</cell><cell>83</cell><cell></cell></row><row><cell></cell><cell></cell><cell cols="3">JUDGEMENTAL</cell><cell></cell><cell></cell><cell></cell><cell>376</cell><cell>75</cell><cell></cell></row><row><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell>Task 3</cell><cell></cell><cell></cell><cell></cell><cell></cell></row><row><cell></cell><cell></cell><cell cols="5">IDEOLOGICAL-INEQUALITY</cell><cell></cell><cell cols="2">1113 212</cell><cell></cell></row><row><cell></cell><cell></cell><cell 
cols="5">STEREOTYPING-DOMINANCE</cell><cell></cell><cell cols="2">1423 241</cell><cell></cell></row><row><cell></cell><cell></cell><cell cols="4">OBJECTIFICATION</cell><cell></cell><cell></cell><cell cols="2">1103 183</cell><cell></cell></row><row><cell></cell><cell></cell><cell cols="4">SEXUAL-VIOLENCE</cell><cell></cell><cell></cell><cell>675</cell><cell>123</cell><cell></cell></row><row><cell></cell><cell></cell><cell cols="6">MISOGYNY-NON-SEXUAL-VIOLENCE</cell><cell>856</cell><cell>158</cell><cell></cell></row><row><cell></cell><cell></cell><cell>NO</cell><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell cols="2">3367 479</cell><cell></cell></row><row><cell>I wanna go back</cell><cell>Pre-trained</cell><cell>Multilingual Tokenizer</cell><cell>I back wanna go</cell><cell>Pre-trained</cell><cell>Multilingual Model</cell><cell>h [CLS] h back h I h wanna h</cell><cell>Attention MLP</cell><cell></cell><cell>Aggregation</cell><cell>Classifier</cell><cell>Logits</cell></row></table><note>go Figure 1: Model Architecture. Single sentence is passed through the tokenizer to separate the tokens.</note></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head></head><label></label><figDesc>The tokens are passed into the pretrained multilingual model to achieve contextual representations for each token 𝑡 𝑖 , denoted as 𝐻 = {ℎ [𝐶𝐿𝑆] , ℎ 1 , ℎ 2 , . . . , ℎ 𝑛 , ℎ [𝑆𝐸𝑃 ] }, where ℎ 𝑖 represents the contextual representation of token 𝑡 𝑖 . Specifically, we extract the last layer hidden representations of the pretrained model, which are further fine-tuned during the dataset training.</figDesc><table /><note>1 , 𝑡 2 , . . . , 𝑡 𝑛 , 𝑡 [𝑆𝐸𝑃 ] }, where 𝑡 𝑖 represents the 𝑖-th token and 𝑡 [𝐶𝐿𝑆] &amp; 𝑡 [𝑆𝐸𝑃 ] are the special tokens.</note></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_3"><head>Table 3</head><label>3</label><figDesc>Comparing the Performance of Different Models in Validation Dataset for Task 1 and Task 2</figDesc><table><row><cell></cell><cell cols="2">Performance Metric</cell></row><row><cell>Model</cell><cell>Task 1</cell><cell>Task 2</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_4"><head>Dev Acc↑ Dev F1↑ Dev Acc↑ Dev F1↑ ML Models</head><label></label><figDesc></figDesc><table><row><cell>TF-IDF + SVM</cell><cell>70.88</cell><cell>69.73</cell><cell>61.12</cell><cell>26.70</cell></row><row><cell>TF-IDF + XGB</cell><cell>76.55</cell><cell>76.26</cell><cell>66.11</cell><cell>37.24</cell></row><row><cell>TF-IDF + RF</cell><cell>68.09</cell><cell>66.40</cell><cell>61.12</cell><cell>26.52</cell></row><row><cell></cell><cell cols="2">Deep Learning Models</cell><cell></cell><cell></cell></row><row><cell>LSTM</cell><cell>71.73</cell><cell>73.14</cell><cell>64.44</cell><cell>31.92</cell></row><row><cell>LSTM + Attention</cell><cell>74.95</cell><cell>74.82</cell><cell>65.28</cell><cell>32.87</cell></row><row><cell></cell><cell cols="3">Transformer based Models</cell><cell></cell></row><row><cell>XLM-RoBERTa</cell><cell>83.94</cell><cell>83.94</cell><cell>73.13</cell><cell>56.55</cell></row><row><cell>mBERT</cell><cell>81.05</cell><cell>81.04</cell><cell>71.58</cell><cell>53.16</cell></row><row><cell>BETO</cell><cell>80.73</cell><cell>80.72</cell><cell>70.99</cell><cell>54.27</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_5"><head>Table 4</head><label>4</label><figDesc>Comparing the Performance of Different Models in Validation Dataset for Task 3</figDesc><table><row><cell cols="3">Performance Metric</cell></row><row><cell>Model</cell><cell>Task 3</cell><cell></cell></row><row><cell cols="3">Dev Acc↑ Dev F1↑</cell></row><row><cell>ML Models</cell><cell></cell><cell></cell></row><row><cell>TF-IDF + LogisticRegression</cell><cell>85.89</cell><cell>60.85</cell></row><row><cell>TF-IDF + SVM</cell><cell>84.85</cell><cell>55.79</cell></row><row><cell>TF-IDF + RF</cell><cell>84.85</cell><cell>55.79</cell></row><row><cell cols="2">Transformer based Models</cell><cell></cell></row><row><cell>XLM-RoBERTa</cell><cell>87.69</cell><cell>47.76</cell></row><row><cell>mBERT</cell><cell>87.86</cell><cell>48.21</cell></row><row><cell>BETO</cell><cell>87.41</cell><cell>47.13</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_6"><head>Table 5</head><label>5</label><figDesc>Effect of different preprocessing in dev set.</figDesc><table><row><cell>Task</cell><cell>Model</cell><cell>Preprocessing</cell><cell cols="2">Performance Metric Dev Acc↑ Dev F1↑</cell></row><row><cell></cell><cell></cell><cell>No preprocessing</cell><cell>83.94</cell><cell>83.94</cell></row><row><cell cols="2">Task 1 XLM-RoBERTa</cell><cell>Username Removed URL Removed</cell><cell>83.73 84.80</cell><cell>83.73 84.76</cell></row><row><cell></cell><cell></cell><cell>Punctuation and Emoji Removed</cell><cell>84.37</cell><cell>84.36</cell></row><row><cell></cell><cell></cell><cell>No preprocessing</cell><cell>72.06</cell><cell>51.43</cell></row><row><cell cols="2">Task 2 XLM-RoBERTa</cell><cell>Username Removed URL Removed</cell><cell>71.94 71.11</cell><cell>52.32 52.00</cell></row><row><cell></cell><cell></cell><cell>Punctuation and Emoji Removed</cell><cell>71.11</cell><cell>53.82</cell></row><row><cell></cell><cell></cell><cell>No preprocessing</cell><cell>87.86</cell><cell>38.21</cell></row><row><cell>Task 3</cell><cell>mBERT</cell><cell>Username Removed URL Removed</cell><cell>87.76 88.25</cell><cell>40.94 54.77</cell></row><row><cell></cell><cell></cell><cell>Punctuation and Emoji removed</cell><cell>87.86</cell><cell>48.72</cell></row><row><cell>4.3. Settings</cell><cell></cell><cell></cell><cell></cell><cell></cell></row></table><note>For the hyper parameter settings, we also investigated with different values of them. We did ablation studies on learning rate with two different values [1e-5, 2e-5], on batch size with values<ref type="bibr" target="#b15">[16,</ref> 32]  and on random seed with values [0, 42]. The experimented results are reported in the Table<ref type="bibr" target="#b5">[6,</ref><ref type="bibr" target="#b6">7,</ref><ref type="bibr" target="#b7">8]</ref>. Considering</note></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_7"><head>Table 6</head><label>6</label><figDesc>Effect of different learning rates in dev set.</figDesc><table><row><cell>Task Learning Rate Task 1 2 × 10 -5 1 × 10 -5 Task 2 2 × 10 -5 1 × 10 -5 Task 3 2 × 10 -5 1 × 10 -5</cell><cell>Performance Metric Dev Acc↑ Dev F1↑ 84.90 83.90 84.80 84.76 72.06 51.43 70.87 51.93 88.25 54.77 85.49 52.36</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_8"><head>Table 7</head><label>7</label><figDesc>Effect of different batch sizes in dev set.</figDesc><table><row><cell cols="2">Task Batch Size</cell><cell cols="2">Performance Metric Dev Acc↑ Dev F1↑</cell></row><row><cell>Task 1</cell><cell>32 16</cell><cell>84.90 82.01</cell><cell>83.90 81.92</cell></row><row><cell>Task 2</cell><cell>32 16</cell><cell>72.06 73.25</cell><cell>51.43 54.42</cell></row><row><cell>Task 3</cell><cell>32 16</cell><cell>88.25 85.35</cell><cell>54.77 51.69</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_9"><head>Table 8</head><label>8</label><figDesc>Effect of different seed values for model perforamce in dev set.</figDesc><table><row><cell cols="2">Task Random Seed</cell><cell cols="2">Performance Metric Dev Acc↑ Dev F1↑</cell></row><row><cell>Task 1</cell><cell>42 0</cell><cell>84.90 84.05</cell><cell>83.90 84.03</cell></row><row><cell>Task 2</cell><cell>42 0</cell><cell>73.25 71.70</cell><cell>54.42 52.32</cell></row><row><cell>Task 3</cell><cell>42 0</cell><cell>88.25 85.39</cell><cell>54.77 54.52</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_10"><head>Table 9</head><label>9</label><figDesc>Effect of different aggregation processes of the CLS token in dev set.</figDesc><table><row><cell>Task</cell><cell>Model</cell><cell cols="2">Sentence Repr Extractor Aggregation Method</cell><cell cols="2">Performance Metric Dev Acc↑ Dev F1↑</cell></row><row><cell></cell><cell>XLM-RoBERTa</cell><cell>CLS</cell><cell>-</cell><cell>84.90</cell><cell>83.90</cell></row><row><cell>Task 1</cell><cell>XLM-RoBERTa XLM-RoBERTa</cell><cell>Attention Attention + CLS</cell><cell>-Addition</cell><cell>84.15 86.30</cell><cell>83.94 86.28</cell></row><row><cell></cell><cell>XLM-RoBERTa</cell><cell>Attention + CLS</cell><cell>Concat</cell><cell>84.90</cell><cell>84.88</cell></row><row><cell></cell><cell>XLM-RoBERTa</cell><cell>CLS</cell><cell>-</cell><cell>72.06</cell><cell>51.43</cell></row><row><cell>Task 2</cell><cell>XLM-RoBERTa XLM-RoBERTa</cell><cell>Attention Attention + CLS</cell><cell>-Addition</cell><cell>72.77 73.13</cell><cell>52.65 54.65</cell></row><row><cell></cell><cell>XLM-RoBERTa</cell><cell>Attention + CLS</cell><cell>Concat</cell><cell>74.20</cell><cell>57.87</cell></row><row><cell></cell><cell>mBERT</cell><cell>CLS</cell><cell>-</cell><cell>88.12</cell><cell>54.77</cell></row><row><cell>Task 3</cell><cell>mBERT mBERT</cell><cell>Attention Attention + CLS</cell><cell>-Addition</cell><cell>87.97 84.40</cell><cell>51.50 53.50</cell></row><row><cell></cell><cell>mBERT</cell><cell>Attention + CLS</cell><cell>Concat</cell><cell>88.24</cell><cell>54.82</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_11"><head>Table 10</head><label>10</label><figDesc>EXIST test results for submitted predictions with the best performing configuration along with CLS token aggregation on the dev set.</figDesc><table><row><cell>Task Model [</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_12"><head>Agg. Type] Hard-Hard Rank ICM-Hard ICM-Hard Norm Macro F1</head><label></label><figDesc></figDesc><table><row><cell></cell><cell>XLM-RoBERTa +</cell><cell>ALL</cell><cell>29</cell><cell>47.79</cell><cell>74.02</cell><cell>75.08</cell></row><row><cell>Task 1</cell><cell>Attention +</cell><cell>EN</cell><cell>29</cell><cell>46.01</cell><cell>73.48</cell><cell>72.09</cell></row><row><cell></cell><cell>CLS [Addition]</cell><cell>ES</cell><cell>29</cell><cell>48.04</cell><cell>74.02</cell><cell>77.33</cell></row><row><cell></cell><cell>XLM-RoBERTa +</cell><cell>ALL</cell><cell>9</cell><cell>20.89</cell><cell>56.79</cell><cell>48.56</cell></row><row><cell>Task 2</cell><cell>Attention +</cell><cell>EN</cell><cell>13</cell><cell>12.03</cell><cell>54.16</cell><cell>44.31</cell></row><row><cell></cell><cell>CLS [Concat]</cell><cell>ES</cell><cell>10</cell><cell>27.34</cell><cell>58.54</cell><cell>51.96</cell></row><row><cell></cell><cell>mBERT +</cell><cell>ALL</cell><cell>14</cell><cell>-25.97</cell><cell>43.97</cell><cell>43.79</cell></row><row><cell>Task 3</cell><cell>Attention +</cell><cell>EN</cell><cell>17</cell><cell>-26.92</cell><cell>43.40</cell><cell>41.56</cell></row><row><cell></cell><cell>CLS [Concat]</cell><cell>ES</cell><cell>16</cell><cell>-27.41</cell><cell>43.88</cell><cell>44.60</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_13"><head>Label List of Most Attentive Tokens [Top 10]</head><label></label><figDesc></figDesc><table><row><cell>Task 1</cell><cell>NO YES</cell><cell>volant, slut, cum, hot , summer peligro, constante, staff, development , practic penis, forever, death, economy , ika GPS, tomorrow, exam, gangbang, room</cell></row><row><cell></cell><cell>NO</cell><cell>pun, wall, question, falso, auténtico modo, concert, syon, yes, making</cell></row><row><cell>Task 2</cell><cell>DIRECT REPORTED</cell><cell>where, GPS, tomorrow, exam, room attention, foot, cla, without, uni wenr, school, bath, dom, GAL ILE, ingu, ebla, dia, ACIÓN</cell></row><row><cell></cell><cell>JUDGEMENTAL</cell><cell>saben, ándose, cocina, make, sense want, mother, ape, ici, gas</cell></row></table></figure>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" xml:id="foot_0">arXiv:2009.07896.</note>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgements</head><p>This project has been sponsored by Penta Global Limited Bangladesh. We would like to express our deepest gratitude to Penta Global for their financial support.</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Social media and online hate</title>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">B</forename><surname>Walther</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Current Opinion in Psychology</title>
		<imprint>
			<biblScope unit="volume">45</biblScope>
			<biblScope unit="page">101298</biblScope>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<monogr>
		<author>
			<persName><forename type="first">A</forename><surname>Elbarazi</surname></persName>
		</author>
		<idno type="DOI">10.19080/GJIDD.2023.12.555838</idno>
		<title level="m">How social media affects people&apos;s ideas on sexist behaviours and gender-based violence</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Automatic hate speech detection using killer natural language processing optimizing ensemble deep learning approach</title>
		<author>
			<persName><forename type="first">Z</forename><surname>Al-Makhadmeh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Tolba</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Computing</title>
		<imprint>
			<biblScope unit="volume">102</biblScope>
			<biblScope unit="page" from="501" to="522" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Overview of EXIST 2024 -Learning with Disagreement for Sexism Identification and Characterization in Social Networks and Memes</title>
		<author>
			<persName><forename type="first">L</forename><surname>Plaza</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Carrillo-De-Albornoz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Ruiz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Maeso</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Chulvi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Rosso</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Amigó</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Gonzalo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Morante</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Spina</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Experimental IR Meets Multilinguality, Multimodality, and Interaction. Proceedings of the Fifteenth International Conference of the CLEF Association</title>
				<meeting><address><addrLine>CLEF</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024">2024. 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Overview of EXIST 2024 -Learning with Disagreement for Sexism Identification and Characterization in Social Networks and Memes (Extended Overview)</title>
		<author>
			<persName><forename type="first">L</forename><surname>Plaza</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Carrillo-De-Albornoz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Ruiz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Maeso</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Chulvi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Rosso</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Amigó</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Gonzalo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Morante</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Spina</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Working Notes of CLEF 2024 -Conference and Labs of the Evaluation Forum</title>
				<editor>
			<persName><forename type="first">G</forename><surname>Faggioli</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">N</forename><surname>Ferro</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">P</forename><surname>Galuščáková</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">A</forename><forename type="middle">G S</forename><surname>Herrera</surname></persName>
		</editor>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">A novel text mining approach based on tf-idf and support vector machine for news classification</title>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">M H</forename><surname>Dadgar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">S</forename><surname>Araghi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">M</forename><surname>Farahani</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">IEEE International Conference on Engineering and Technology (ICETECH)</title>
				<imprint>
			<publisher>IEEE</publisher>
			<date type="published" when="2016">2016. 2016</date>
			<biblScope unit="page" from="112" to="116" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">Android malware detection in bytecode level using tf-idf and xgboost</title>
		<author>
			<persName><forename type="first">G</forename><surname>Ozogur</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">A</forename><surname>Erturk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Gurkas Aydin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">A</forename><surname>Aydin</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">The Computer Journal</title>
		<imprint>
			<biblScope unit="volume">66</biblScope>
			<biblScope unit="page" from="2317" to="2328" />
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">Emotion analysis in text using tf-idf</title>
		<author>
			<persName><forename type="first">V</forename><surname>Sundaram</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Ahmed</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Muqtadeer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">R</forename><surname>Reddy</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">2021 11th International Conference on Cloud Computing, Data Science &amp; Engineering (Confluence)</title>
				<imprint>
			<publisher>IEEE</publisher>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="292" to="297" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Lstm recurrent neural networks for short text and sentiment classification</title>
		<author>
			<persName><forename type="first">J</forename><surname>Nowak</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Taspinar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Scherer</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Artificial Intelligence and Soft Computing: 16th International Conference, ICAISC 2017</title>
				<meeting><address><addrLine>Zakopane, Poland</addrLine></address></meeting>
		<imprint>
			<publisher>Springer</publisher>
			<date type="published" when="2017">June 11-15, 2017. 2017</date>
			<biblScope unit="volume">16</biblScope>
			<biblScope unit="page" from="553" to="562" />
		</imprint>
	</monogr>
	<note>Proceedings, Part II</note>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">Text classification based on lstm and attention</title>
		<author>
			<persName><forename type="first">X</forename><surname>Bai</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">2018 Thirteenth International Conference on Digital Information Management (ICDIM), IEEE</title>
				<imprint>
			<date type="published" when="2018">2018</date>
			<biblScope unit="page" from="29" to="32" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">Ynu@ dravidian-codemix-fire2020: Xlm-roberta for multi-language sentiment analysis</title>
		<author>
			<persName><forename type="first">X</forename><surname>Ou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Li</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">FIRE (Working Notes)</title>
				<imprint>
			<date type="published" when="2020">2020</date>
			<biblScope unit="page" from="560" to="565" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Multi-class sentiment analysis of urdu text using multilingual bert</title>
		<author>
			<persName><forename type="first">L</forename><surname>Khan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Amjad</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Ashraf</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H.-T</forename><surname>Chang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Scientific Reports</title>
		<imprint>
			<biblScope unit="volume">12</biblScope>
			<biblScope unit="page">5436</biblScope>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">Requirements classification using fasttext and beto in spanish documents</title>
		<author>
			<persName><forename type="first">M.-I</forename><surname>Limaylla-Lunarejo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Condori-Fernandez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">R</forename><surname>Luaces</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">International Working Conference on Requirements Engineering: Foundation for Software Quality</title>
				<imprint>
			<publisher>Springer</publisher>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="159" to="176" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">(???</title>
		<author>
			<persName><forename type="first">M</forename><surname>Fahim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">S</forename><surname>Shahriar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">R</forename><surname>Amin</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Hatexplain space model: Fusing robustness with explainability in hate speech analysis</title>
				<imprint/>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">Aambela at blp-2023 task 2: Enhancing banglabert performance for bangla sentiment analysis task with in task pretraining and adversarial weight perturbation</title>
		<author>
			<persName><forename type="first">M</forename><surname>Fahim</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the First Workshop on Bangla Language Processing</title>
				<meeting>the First Workshop on Bangla Language Processing</meeting>
		<imprint>
			<publisher>BLP</publisher>
			<date type="published" when="2023">2023. 2023</date>
			<biblScope unit="page" from="317" to="323" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">Edal: Entropy based dynamic attention loss for hatespeech classification</title>
		<author>
			<persName><forename type="first">M</forename><surname>Fahim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">A</forename><surname>Ali</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">A</forename><surname>Amin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">M</forename><surname>Rahman</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 37th Pacific Asia Conference on Language, Information and Computation</title>
				<meeting>the 37th Pacific Asia Conference on Language, Information and Computation</meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="775" to="785" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">Long short-term memory</title>
		<author>
			<persName><forename type="first">S</forename><surname>Hochreiter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Schmidhuber</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Neural computation</title>
		<imprint>
			<biblScope unit="volume">9</biblScope>
			<biblScope unit="page" from="1735" to="1780" />
			<date type="published" when="1997">1997</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<monogr>
		<author>
			<persName><forename type="first">A</forename><surname>Conneau</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Khandelwal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Goyal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Chaudhary</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Wenzek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Guzmán</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Grave</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Ott</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Zettlemoyer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Stoyanov</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1911.02116</idno>
		<title level="m">Unsupervised cross-lingual representation learning at scale</title>
				<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b18">
	<monogr>
		<title level="m" type="main">How language-neutral is multilingual bert?</title>
		<author>
			<persName><forename type="first">J</forename><surname>Libovický</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Rosa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Fraser</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1911.03310</idno>
		<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b19">
	<analytic>
		<title level="a" type="main">Spanish pre-trained bert model and evaluation data</title>
		<author>
			<persName><forename type="first">J</forename><surname>Cañete</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Chaperon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Fuentes</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J.-H</forename><surname>Ho</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Kang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Pérez</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">PML4DC at ICLR</title>
				<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b20">
	<monogr>
		<author>
			<persName><forename type="first">N</forename><surname>Kokhlikyan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Miglani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Martin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Alsallakh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Reynolds</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Melnikov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Kliushkina</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Araya</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Yan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Reblitz-Richardson</surname></persName>
		</author>
		<idno>CoRR abs/2009.07896</idno>
		<ptr target="https://arxiv.org/abs/2009.07896" />
		<title level="m">Captum: A unified and generic model interpretability library for pytorch</title>
				<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
