<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">UNED-NLP at eRisk 2022: Analyzing gambling disorders in Social Media using Approximate Nearest Neighbors</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Hermenegildo</forename><surname>Fabregat</surname></persName>
							<email>gildo.fabregat@lsi.uned.es</email>
							<affiliation key="aff0">
								<orgName type="department">Dpto. Lenguajes y Sistemas Informáticos</orgName>
								<orgName type="laboratory">NLP &amp; IR Group</orgName>
								<orgName type="institution">Universidad Nacional de Educación a Distancia (UNED)</orgName>
								<address>
									<addrLine>Juan del Rosal 16</addrLine>
									<postCode>28040</postCode>
									<settlement>Madrid</settlement>
									<country key="ES">Spain</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Andres</forename><surname>Duque</surname></persName>
							<email>aduque@lsi.uned.es</email>
							<affiliation key="aff0">
								<orgName type="department">Dpto. Lenguajes y Sistemas Informáticos</orgName>
								<orgName type="laboratory">NLP &amp; IR Group</orgName>
								<orgName type="institution">Universidad Nacional de Educación a Distancia (UNED)</orgName>
								<address>
									<addrLine>Juan del Rosal 16</addrLine>
									<postCode>28040</postCode>
									<settlement>Madrid</settlement>
									<country key="ES">Spain</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="department">IMIENS: Instituto Mixto de Investigación</orgName>
								<orgName type="institution">Escuela Nacional de Sanidad</orgName>
								<address>
									<addrLine>Monforte de Lemos 5</addrLine>
									<postCode>28019</postCode>
									<settlement>Madrid</settlement>
									<country key="ES">Spain</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Lourdes</forename><surname>Araujo</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Dpto. Lenguajes y Sistemas Informáticos</orgName>
								<orgName type="laboratory">NLP &amp; IR Group</orgName>
								<orgName type="institution">Universidad Nacional de Educación a Distancia (UNED)</orgName>
								<address>
									<addrLine>Juan del Rosal 16</addrLine>
									<postCode>28040</postCode>
									<settlement>Madrid</settlement>
									<country key="ES">Spain</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="department">IMIENS: Instituto Mixto de Investigación</orgName>
								<orgName type="institution">Escuela Nacional de Sanidad</orgName>
								<address>
									<addrLine>Monforte de Lemos 5</addrLine>
									<postCode>28019</postCode>
									<settlement>Madrid</settlement>
									<country key="ES">Spain</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Juan</forename><surname>Martinez-Romo</surname></persName>
							<email>juaner@lsi.uned.es</email>
							<affiliation key="aff0">
								<orgName type="department">Dpto. Lenguajes y Sistemas Informáticos</orgName>
								<orgName type="laboratory">NLP &amp; IR Group</orgName>
								<orgName type="institution">Universidad Nacional de Educación a Distancia (UNED)</orgName>
								<address>
									<addrLine>Juan del Rosal 16</addrLine>
									<postCode>28040</postCode>
									<settlement>Madrid</settlement>
									<country key="ES">Spain</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="department">IMIENS: Instituto Mixto de Investigación</orgName>
								<orgName type="institution">Escuela Nacional de Sanidad</orgName>
								<address>
									<addrLine>Monforte de Lemos 5</addrLine>
									<postCode>28019</postCode>
									<settlement>Madrid</settlement>
									<country key="ES">Spain</country>
								</address>
							</affiliation>
						</author>
						<author>
							<affiliation key="aff2">
								<orgName type="department">Evaluation Forum</orgName>
								<address>
									<addrLine>September 5-8</addrLine>
									<postCode>2022</postCode>
									<settlement>Bologna</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">UNED-NLP at eRisk 2022: Analyzing gambling disorders in Social Media using Approximate Nearest Neighbors</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">6EA1DF7C45B1048BE4FA7D4341546DFE</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2023-03-24T03:22+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Pathological gambling detection</term>
					<term>Approximate Nearest Neighbors</term>
					<term>Vector representations</term>
					<term>Relabeling</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>This paper describes our proposal for tackling Task 1 (Early Detection of Signs of Pathological Gambling) from the CLEF 2022 eRisk Workshop. The challenge consists of processing messages written by Social Media users for the detection of early signs of pathological gambling. Our proposal is based on the calculation of Approximate Nearest Neighbors (ANN) performed on vectorial representations of the given messages. We introduce a relabeling process to modify the granularity of the labeling schema in the training dataset, thus converting it from the original user-based annotation to a message-based one. Our approach achieves the best average performance in the decision-based evaluation, as well as in the ranking-based evaluation. In addition, our system proves to be the fastest one in terms of time needed to process the whole test dataset. This indicates that the proposed relabeling scheme allows us to capture more easily the textual information that leads to a correct detection of pathological gambling.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>In the Internet era, social media analysis for the early detection of potential health risks is a particularly interesting research area. In this context, the different editions of the eRisk workshop, usually located within the Conference and Labs of the Evaluation Forum (CLEF) since 2017, can be found among the efforts carried out by the scientific community. This workshop serves as a meeting point in which both methodologies and practical approaches have been developed for the early detection of different types of health risks, such as eating disorders, self-harm or depression, through the textual analysis of posts and messages of social media users.</p><p>In this paper we present a system for tackling Task 1 of the eRisk 2022 Workshop: Early Detection of Signs of Pathological Gambling <ref type="bibr" target="#b0">[1]</ref>. The approach first relies on generating vector-based representations of users' messages through sentence embeddings, in order to subsequently detect positive messages using methods based on Approximate Nearest Neighbors (ANN) techniques. 
Although ANNs can be seen as a simple machine learning technique, we show in the paper how an adequate pre-processing of the training dataset based on the reduction of the original label granularity allows us to obtain the best overall results in the competition.</p><p>The rest of the paper is structured as follows: an overview of previous work related to the task considered and the techniques used in this work is shown in Section 2. Section 3 is devoted to describing the addressed task, including the available dataset and evaluation metrics, while the developed system is presented in Section 4. The achieved results are shown, compared to other participating systems, and discussed in Section 5. Finally, Section 6 presents the main conclusions and future lines of work.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Related Work</head><p>Gambling disorder <ref type="bibr" target="#b1">[2]</ref> (GD) is characterized by a persistent and recurrent pattern of gambling that is associated with significant distress or substantial upset. The prevalence of GD has been estimated at 0.5% of the adult population in the United States, with comparable or even higher estimates in other countries.</p><p>People with GD are often not treated or even recognized as such. GD often co-occurs with other psychiatric disorders. High rates of mood, anxiety, attention deficit disorders and substance use disorders have been reported <ref type="bibr" target="#b2">[3]</ref> in people with GD. It is also often accompanied by a higher rate of unemployment, economic difficulties, divorce, and poorer health. In addition, GD is closely related to other addictive disorders, being the first non-substance addictive behavior to be recognized <ref type="bibr" target="#b3">[4]</ref>.</p><p>Social networks are an excellent source of information where studies can be carried out for the early detection of people with gambling problems. In this line, the eRisk competition considered the problem of pathological gambling for the first time in 2021 <ref type="bibr" target="#b4">[5]</ref>. Several systems participated in the shared task with different approaches: RELAI <ref type="bibr" target="#b5">[6]</ref>, UPV-Symanto <ref type="bibr" target="#b6">[7]</ref>, BLUE <ref type="bibr" target="#b7">[8]</ref>, UNSL <ref type="bibr" target="#b8">[9]</ref>, and CEDRI <ref type="bibr" target="#b9">[10]</ref>. Considering the "test-only" nature of this first version of the task, several of these participating systems <ref type="bibr" target="#b5">[6,</ref><ref type="bibr" target="#b6">7,</ref><ref type="bibr" target="#b7">8,</ref><ref type="bibr" target="#b9">10]</ref> used external resources, such as posts from Reddit crawled by themselves, for training their systems. 
Most of them applied Transformer-based architectures <ref type="bibr" target="#b10">[11]</ref>, as well as other types of neural networks. The UNSL team obtained the best results using the Early Risk Detection Framework (ERD).</p><p>This year we participated for the first time in the competition on gambling disorder. Our system is based on a simple approach that has proven to be very effective. The idea is to carry out a re-labeling of users' messages using a method based on Approximate Nearest Neighbor (ANN) search. The exact nearest neighbor search (NNS) for the point corresponding to a given query is defined as the point corresponding to the shortest distance to the query. A generalization of the nearest neighbor search is the k-nearest neighbor search (k-NNS), which targets the k nearest vectors for the query. Due to the cost associated with dimensionality, many proposals have been developed focusing on the approximate solution of the NNS and k-NNS problem. A recent work <ref type="bibr" target="#b11">[12]</ref> has presented a comparison and evaluation of different approaches to the problem. According to this work, state-of-the-art ANN methods can be classified into three types: Hashing-based, Partition-based and Graph-based. Hashing-based methods transform data points to a low-dimensional representation, where each point is represented by a short code (hash code). Partition-based methods can be seen as the division of high-dimensional space into multiple disjoint regions. The partitioning process is usually done recursively, hence these methods often use a tree- or forest-based representation. We have used one of these methods in this work, Annoy <ref type="bibr" target="#b12">[13]</ref>, a hyperplane partitioning method that recursively divides the space by the hyperplane with random direction. Graph-based methods construct a proximity graph in which each datum corresponds to a node and the edges connecting some nodes define the neighborhood relationship. 
The main idea of these methods is that a neighbor's neighbor is likely to also be a neighbor. The search can be performed efficiently by iteratively extending neighbors of neighbors in a best-first search strategy. Depending on the structure of the graph, different graph-based methods can be distinguished. In this work we have used a method for Hierarchical Navigable Small World graphs <ref type="bibr" target="#b13">[14]</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Task 1: Early Detection of Signs of Pathological Gambling</head><p>Task 1 of eRisk 2022 <ref type="bibr" target="#b0">[1]</ref> is denoted "Early detection of signs of pathological gambling". This is the second edition of the task, which was first introduced in the CLEF 2021 eRisk Workshop <ref type="bibr" target="#b4">[5]</ref>. In this task, participating systems are asked to determine whether an individual can be classified as a pathological gambler (positive users) or a non-pathological gambler (negative users) based on the user's Social Media messages. Systems must sequentially analyze chronological posts for each user for detecting early traces of pathological gambling.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Dataset</head><p>The dataset used in the task is composed of a set of XML documents, each of them containing chronologically ordered Social Media posts belonging to a particular user. The training dataset contains a total of 2,348 documents, each of them annotated as "1" (positive) if the user is labeled as a pathological gambler, and "0" (negative) otherwise.</p><p>The test dataset is provided through a server to which participants must connect to iteratively receive user writings. The total number of test users is 2,079 (81 pathological gamblers and 1,998 control users), with a maximum number of user writings of 2,001, while the average number of user writings is 495.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Metrics</head><p>System evaluation is twofold:</p><p>• Decision-based evaluation: This first type of evaluation aims to analyze the performance of the participating systems in terms of standard measures such as Precision, Recall and F-Measure. However, other metrics are also introduced in this evaluation that take into account the delay incurred by a system before it detects a true positive. Two of these metrics, denoted $ERDE$ and $ERDE_o$, consider the number or the percentage of messages that have to be processed before emitting an alert of positive user. In order to overcome the low interpretability of these latter metrics, a latency-weighted F-Score is also introduced by multiplying the standard F-Measure by a penalty factor based on the median delay of true positive detection.</p><p>• Ranking-based evaluation: The second type of evaluation is a complementary approach that requires the systems to provide a score indicating the risk of pathological gambling of a user every time a new message is analyzed. Users are then ranked using this score and standard ranking metrics such as $P@k$ or $NDCG@k$ can be applied, with the parameter $k$ being the number of analyzed messages before evaluating the ranking.</p><p>More information about the complete set of metrics employed in the evaluation can be found in previous overviews of eRisk competitions <ref type="bibr" target="#b14">[15,</ref><ref type="bibr" target="#b4">5]</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Proposed Model</head><p>Due to the large amount of information available in social networks, an approach based on Approximate Nearest Neighbors (ANN) has been proposed, its main benefit being its efficiency in processing large data collections. The following sections describe the main components of the proposed model and the configurations that have been explored.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.">Data representation</head><p>We use Universal Sentence Encoder <ref type="bibr" target="#b15">[16]</ref> to encode each user's messages. Such models are trained and optimized for encoding texts longer than words e.g. sentences, phrases or short paragraphs. The model we use is trained with a deep average network <ref type="bibr" target="#b16">[17]</ref> (DAN) using data from different sources in English. Although DAN approaches produce unordered representations of the information by averaging the terms in a given text, these models are able to capture subtle differences between similar texts. In short, for each message encoded by this model, a 512-dimensional vector is generated.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.">Approximate Nearest Neighbors</head><p>Although nearest neighbor retrieval is a conceptually simple procedure, in domains such as social networks, where a large amount of information is available, it is a difficult problem to address. In this domain the use of brute force based search techniques is replaced by the use of non-exact techniques based on the use of more complex structures e.g. graphs and trees. Currently there are different tools and approaches that have proven to be very successful when analyzing recall results and queries per second <ref type="bibr" target="#b17">[18]</ref>. Due to their popularity and performance we have explored the use of Annoy<ref type="foot" target="#foot_0">1</ref> and Non-Metric Space Library <ref type="bibr" target="#b13">[14]</ref> (NMSLIB):</p><p>• Annoy: This library uses tree-like structures for the representation of nodes and random projections for the division of the subspace between adjacent nodes. To explore this library, we have used a space generated by the inner-dot product of the $L_2$-normalized vectors generated by the Universal Sentence Encoder.</p><p>• NMSLIB: Library for approximate K-nearest neighbor search based on navigable small-world graphs with controllable hierarchy (Hierarchical NSW, HNSW). For the calculation of similarity between instances NMSLIB supports the use of different metrics and data formats. In this sense, we explored a dense $L_2$ space.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3.">Tag and scoring function</head><p>Once the training set was transformed using Universal Sentence Encoder, and after generating the nearest neighbor index using Annoy or NMSLIB libraries, we propose a labeling and scoring approach based on the classes of the neighbors retrieved for each message in the test set. Given a message $M$ from a user $U$, we classify $U_M$ as positive if the 20 nearest neighbors retrieved correspond to messages from positive users. Following the same idea, we considered as scoring function the distance of $U_M$ from the nearest recovered neighbors ($1 - \sum_{x=1}^{20} \mathit{cosine}(U_M, M_x)$). This number of $k = 20$ nearest neighbors was set from a previous parameter tuning evaluation in which some different values of $k$ were explored.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.4.">Relabeling process</head><p>The corpus provided by the organizers presents a user-based labeling, i.e., each user is labeled as positive if at least one positive message can be found within his/her posts, and negative otherwise. However, positive/negative annotations for each message in the corpus are not provided. We consider that the correct classification of positive and negative messages is crucial for achieving a good performance in this task. Hence, we propose an approach to re-annotate the training corpus in order to generate a message-level labeling. For this purpose, we first consider all messages of a positive user to be positive, and all messages of a negative user to be negative. Once the k-nearest neighbor query index is generated, we iteratively process each message from each positive user of the training set, and re-annotate its class according to the above-mentioned labeling algorithm. We assume that only positive users may contain negative messages, since if negative users contained positive messages, they would have been labeled as positive. Hence, in each iteration of the algorithm, the number of positive messages is reduced if the algorithm re-labels them as negative. After processing the training set, if modifications have been made, the same method is applied again until convergence is reached, that is, until there are no changes in the training set labels.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.5.">Crawling new positive instances</head><p>In order to reduce the impact on recall that the relabeling algorithm could have, the following data were collected from gamblers' help associations:</p><p>• Testimonial facts: A total of 234 testimonials were collected from websites<ref type="foot" target="#foot_1">2</ref> containing information about pathological gamblers and their friends and family. Unlike the Reddit posts, these new data are more carefully structured and contain longer texts.</p><p>• Forums: Messages from a forum devoted to helping players<ref type="foot" target="#foot_2">3</ref> were automatically collected and those potentially positive messages were selected using the proposed system. Finally, we included in the training set those messages classified as positive by the system. In short, a total of 232 new instances were added.</p><p>Analyzing the format of the corpus texts, the instances extracted from the forums present a similar format and structure. No specific pre-processing techniques, such as text size limitation or language control, have been applied.</p><p>As shown in Table <ref type="table" target="#tab_0">1</ref>, we submitted 5 different configurations, in which we tried to explore combinations of the previously mentioned different aspects of the proposed approach.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Results and Discussion</head><p>The results obtained by our approach are shown and discussed below.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Execution time:</head><p>In order to avoid possible errors during the test phase due to power or network failures, we processed the test data on a shared server with two Intel(R) Xeon(R) CPUs E5-2630 v4 @ 2.20GHz and 64 GB of RAM. As can be seen in Table <ref type="table" target="#tab_1">2</ref>, the proposed batch of experiments achieved the best execution times among the systems that processed the whole test set. These results were influenced by the use of non-exhaustive nearest-neighbor retrieval algorithms. Although we presented runs using different algorithms, all of them are oriented to the processing of large datasets and include optimizations for this purpose. While Annoy uses tree-like structures for the representation of nodes and random projections for the division of the subspace between adjacent nodes, NMSLIB uses a graph-based structure and the projection of the different nodes onto a skip-list. Both algorithms include customizable parameters to optimize their performance, e.g. number of trees (Annoy) or number of Zero node links (NMSLIB). Although we do not perform an exhaustive study of these parameters, we try to limit their growth. The final configuration for each of the algorithms is as follows:</p><p>• Annoy – Trees 24</p><p>• NMSLIB – index_params {'M': 200, 'efConstruction': 1000, 'post': 2} – method 'hnsw' – efSearch 100</p><p>Finally, although they are not included in this comparison, our system also achieved execution time results that were below those of many systems that processed the test set only partially.</p><p>Decision-based performance: Table <ref type="table" target="#tab_2">3</ref> shows the results obtained during the decision-based evaluation. This table shows the set of metrics analyzed by the task organizers: Precision, Recall, $F_1$, $ERDE_5$, $ERDE_{50}$, latency, speed and latency-weighted $F_1$. 
In addition to the results of our runs, the best run of each team participating in the competition is shown. As can be seen in the table, considering the latency-weighted $F_1$ metric as the summary metric, our R4 configuration obtained the best results, achieving the highest precision/recall ratio. If we analyze the achieved results in terms of latency, i.e., delay shown by the system expressed as the median number of messages that need to be processed before detecting a positive case, as we used the same inference process in all the runs, no great differences can be found between the different submitted runs. However, if we compare runs R0 and R1, which are differentiated by the application of the relabeling process in R1, we find improvements in precision of around 27% with no excessive penalization of other metrics such as recall. The relabeling process presents a high impact on the corpus since the label of more than 90% of the positive instances is modified after applying it. Considering the amount of discarded information and the improvements obtained through this approach, the analysis of the filtered messages can be of great value to achieve a better understanding of the problem. On the other hand, and seeking to reduce the effect on recall produced by the relabeling process, the inclusion of new data automatically collected was considered in the R2 and R3 runs. The obtained results indicate that our approach to collect and process the new data was not the most efficient one. Finally, R1 and R4 differ by the algorithm for nearest neighbor retrieval used (R1: Annoy, R4: NMSLIB). These algorithms include a parameter space that has not been studied in depth. For this reason, and although the NMSLIB algorithm performs significantly better than Annoy, we consider that a more thorough study on the parameters of the latter technique should be performed before discarding its use. 
</p><p>Ranking-based performance: Table <ref type="table">4</ref> shows the results obtained in the ranking-based evaluation. During this evaluation, the performance of the system is measured after processing 1, 100, 500 and 1000 messages. As shown in the table, the R4 run obtains the best results during this evaluation for all metrics in almost all stages. Comparing the differences between R4 and the best runs presented by BLUE and UNSL, our system outperforms them in most aspects except for NDCG@100 when analyzing 1 and 100 writings. These results indicate that the scoring function described in Section 4.3 is an effective heuristic for assessing the risk of pathological gambling after processing each user message.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 4</head><p>Test results: Results of the ranking-based evaluation for task T1. For the models included in the comparison, the best results are shown in bold.</p><p>1 writing 100 writings 500 writings 1000 writings P@10 NDCG@10 NDCG@100 P@10 NDCG@10 NDCG@100 P@10 NDCG@10 NDCG@100 P@10 NDCG@10 NDCG@100 Run 0 0.9 0.88 0.75 0.4 0.29 0.7 0. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">Conclusions and Future Work</head><p>This article describes our proposed approach for early detection of signs of pathological gambling addressed in Task 1 of eRisk 2022 <ref type="bibr" target="#b0">[1]</ref>. The main contributions presented in this work include the use of Approximate Nearest Neighbor algorithms for retrieving subsets of similar messages previously transformed into a vectorial space using sentence embeddings, as well as the development of a relabeling technique successfully applied to the training set. The use of algorithms such as Annoy or NMSLIB for large scale nearest neighbor retrieval has been of great help for the fast processing of the data. As shown in Table <ref type="table" target="#tab_1">2</ref> and having processed all the messages from the test set, our system obtained the best execution times. On the other hand, as shown in Tables <ref type="table" target="#tab_2">3 and 4</ref>, our model has obtained the best results for the $F_1$, $ERDE_{50}$ and $F_{latency}$ metrics in the decision-based evaluation, as well as the best overall results in the ranking-based evaluation. Most of these results are due to the application of the iterative re-labeling process of the corpus described in Section 4.4 and based on the use of the system itself. Through this process we have also validated the use of the vector space generated by Universal Sentence Encoder to analyze the similarity between messages of different classes.</p><p>The following lines of future work are currently being considered: study of encoders based on more complex approaches such as BERT <ref type="bibr" target="#b18">[19]</ref>, or trained with in-domain information; deeper exploration of the parameters used for the construction of the ANN index; analysis of the impact of different thresholds within the scoring function in the ranking-based evaluation (e.g. 
distance of retrieved neighbors); and application of the proposed system to similar tasks.</p><p>Finally, we believe that an analysis of the identified positive messages would be of great value. Theoretically, these messages should exhibit easily identifiable features and characteristics that can help in the profiling of this type of pathology.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1</head><label>1</label><figDesc>Submitted Runs: Description of the configurations explored in the test phase. Universal Sentence Encoder has been used as encoder while Annoy and Non-Metric Space Library (NMSLIB) have been explored as methods for k-nearest neighbor retrieval. On the other hand, we studied a relabeling process of the training set and the consideration of new data collected automatically.</figDesc><table><row><cell></cell><cell cols="3">ANN Library Relabeling New data</cell></row><row><cell>UNED-NLP Run 0</cell><cell>Annoy</cell><cell>No</cell><cell>No</cell></row><row><cell>UNED-NLP Run 1</cell><cell>Annoy</cell><cell>Yes</cell><cell>No</cell></row><row><cell>UNED-NLP Run 2</cell><cell>Annoy</cell><cell>No</cell><cell>Yes</cell></row><row><cell>UNED-NLP Run 3</cell><cell>Annoy</cell><cell>Yes</cell><cell>Yes</cell></row><row><cell>UNED-NLP Run 4</cell><cell>NMSLIB</cell><cell>Yes</cell><cell>No</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>Table 2</head><label>2</label><figDesc>Test results: Comparison of the execution times required by those systems that processed the whole test set.</figDesc><table><row><cell>Team</cell><cell cols="3">#runs #user writings processed lapse of time (from 1st to last response)</cell></row><row><cell>UNED-NLP</cell><cell>5</cell><cell>2001</cell><cell>17:58:48</cell></row><row><cell>BLUE</cell><cell>3</cell><cell>2001</cell><cell>3 days 13:15:25</cell></row><row><cell>UNSL</cell><cell>5</cell><cell>2001</cell><cell>1 day 21:53:51</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head>Table 3</head><label>3</label><figDesc>Test results: Results of the decision-based evaluation for task T1. For the models included in the comparison, the best results are shown in bold.</figDesc><table><row><cell></cell><cell>Prec</cell><cell>Rec</cell><cell>F1</cell><cell>ERDE5</cell><cell>ERDE50</cell><cell>latency</cell><cell>speed</cell><cell>latency-weighted F1</cell></row><row><cell>UNED-NLP R0</cell><cell>0.285</cell><cell>0.975</cell><cell>0.441</cell><cell>0.019</cell><cell>0.010</cell><cell>2.0</cell><cell>0.996</cell><cell>0.4405</cell></row><row><cell>UNED-NLP R1</cell><cell>0.555</cell><cell>0.938</cell><cell>0.697</cell><cell>0.019</cell><cell>0.009</cell><cell>2.5</cell><cell>0.994</cell><cell>0.693</cell></row><row><cell>UNED-NLP R2</cell><cell>0.296</cell><cell>0.988</cell><cell>0.456</cell><cell>0.019</cell><cell>0.009</cell><cell>2.0</cell><cell>0.996</cell><cell>0.454</cell></row><row><cell>UNED-NLP R3</cell><cell>0.536</cell><cell>0.926</cell><cell>0.679</cell><cell>0.019</cell><cell>0.009</cell><cell>3.0</cell><cell>0.992</cell><cell>0.673</cell></row><row><cell>UNED-NLP R4</cell><cell>0.809</cell><cell>0.938</cell><cell>0.869</cell><cell>0.020</cell><cell>0.008</cell><cell>3.0</cell><cell>0.992</cell><cell>0.862</cell></row><row><cell>SINAI R2</cell><cell>0.908</cell><cell>0.728</cell><cell>0.808</cell><cell>0.016</cell><cell>0.011</cell><cell>1.0</cell><cell>1.000</cell><cell>0.808</cell></row><row><cell>BioInfo_UAVR R1</cell><cell>0.067</cell><cell>1.000</cell><cell>0.126</cell><cell>0.047</cell><cell>0.024</cell><cell>5.0</cell><cell>0.984</cell><cell>0.124</cell></row><row><cell>RELAI R2</cell><cell>0.052</cell><cell>0.963</cell><cell>0.099</cell><cell>0.036</cell><cell>0.029</cell><cell>1.0</cell><cell>1.000</cell><cell>0.099</cell></row><row><cell>BLUE R0</cell><cell>0.260</cell><cell>0.975</cell><cell>0.410</cell><cell>0.015</cell><cell>0.009</cell><cell>1.0</cell><cell>1.000</cell><cell>0.410</cell></row><row><cell>BioNLP_UniBuc R4</cell><cell>0.046</cell><cell>1.000</cell><cell>0.089</cell><cell>0.032</cell><cell>0.031</cell><cell>1.0</cell><cell>1.000</cell><cell>0.089</cell></row><row><cell>UNSL R1</cell><cell>0.461</cell><cell>0.938</cell><cell>0.618</cell><cell>0.041</cell><cell>0.008</cell><cell>11</cell><cell>0.961</cell><cell>0.594</cell></row><row><cell>NLPGroup-IISERB R3</cell><cell>0.140</cell><cell>1.000</cell><cell>0.246</cell><cell>0.025</cell><cell>0.014</cell><cell>2.0</cell><cell>0.996</cell><cell>0.245</cell></row><row><cell>stezmo3 R4</cell><cell>0.160</cell><cell>0.901</cell><cell>0.271</cell><cell>0.043</cell><cell>0.011</cell><cell>7.0</cell><cell>0.977</cell><cell>0.265</cell></row></table></figure>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="1" xml:id="foot_0">https://github.com/spotify/annoy</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="2" xml:id="foot_1">https://gamblershelp.com.au/learn-about-gambling/personal-stories/; http://getgamblingfacts.ca/personalstories/; https://www.gamtalk.org/stories-of-hope/; https://www.gamcare.org.uk/understanding-gambling-problems/people-weve-helped/</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="3" xml:id="foot_2">https://www.gamtalk.org/groups/community/</note>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>This work has been partially supported by the Spanish Ministry of Science and Innovation within the DOTT-HEALTH Project (MCI/AEI/FEDER, UE) under Grant PID2019-106942RB-C32, as well as project RAICES (IMIENS 2022) and the research network AEI RED2018-102312-T (IA-Biomed).</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Overview of erisk 2022: Early risk prediction on the internet</title>
		<author>
			<persName><forename type="first">J</forename><surname>Parapar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Martín-Rodilla</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">E</forename><surname>Losada</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Crestani</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Experimental IR Meets Multilinguality, Multimodality, and Interaction. 13th International Conference of the CLEF Association, CLEF 2022</title>
				<meeting><address><addrLine>Bologna, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">Gambling disorder</title>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">N</forename><surname>Potenza</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><forename type="middle">M</forename><surname>Balodis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Derevensky</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">E</forename><surname>Grant</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">M</forename><surname>Petry</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Verdejo-Garcia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">W</forename><surname>Yip</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Nature reviews Disease primers</title>
		<imprint>
			<biblScope unit="volume">5</biblScope>
			<biblScope unit="page" from="1" to="21" />
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Pathological gambling</title>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">N</forename><surname>Potenza</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">R</forename><surname>Kosten</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><forename type="middle">J</forename><surname>Rounsaville</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Jama</title>
		<imprint>
			<biblScope unit="volume">286</biblScope>
			<biblScope unit="page" from="141" to="144" />
			<date type="published" when="2001">2001</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">A review of gambling disorder and substance use disorders</title>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">J</forename><surname>Rash</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Weinstock</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Van Patten</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Substance abuse and rehabilitation</title>
		<imprint>
			<biblScope unit="volume">7</biblScope>
			<biblScope unit="page">3</biblScope>
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Overview of erisk at CLEF 2021: Early risk prediction on the internet (extended overview)</title>
		<author>
			<persName><forename type="first">J</forename><surname>Parapar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Martín-Rodilla</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">E</forename><surname>Losada</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Crestani</surname></persName>
		</author>
		<ptr target="http://ceur-ws.org/Vol-2936/paper-72.pdf" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Working Notes of CLEF 2021 - Conference and Labs of the Evaluation Forum</title>
				<meeting>the Working Notes of CLEF 2021 - Conference and Labs of the Evaluation Forum<address><addrLine>Bucharest, Romania</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="volume">2936</biblScope>
			<biblScope unit="page" from="864" to="887" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">Early detection of signs of pathological gambling, self-harm and depression through topic extraction and neural networks</title>
		<author>
			<persName><forename type="first">D</forename><surname>Maupomé</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">D</forename><surname>Armstrong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Rancourt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Soulas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M.-J</forename><surname>Meurs</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Working Notes of CLEF</title>
				<meeting>the Working Notes of CLEF</meeting>
		<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<monogr>
		<author>
			<persName><forename type="first">A</forename><surname>Basile</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Chinea-Rios</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A.-S</forename><surname>Uban</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Müller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Rössler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Yenikent</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Chulví</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Rosso</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Franco-Salvador</surname></persName>
		</author>
		<title level="m">Upv-symanto at erisk 2021: Mental health author profiling for early risk prediction on the internet</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
	<note>Working Notes of CLEF</note>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">Early risk detection of pathological gambling, self-harm and depression using bert</title>
		<author>
			<persName><forename type="first">A.-M</forename><surname>Bucur</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Cosma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">P</forename><surname>Dinu</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Working Notes of CLEF</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Unsl at erisk 2021: A comparison of three early alert policies for early risk detection</title>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">M</forename><surname>Loyola</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Burdisso</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Thompson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Cagnina</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Errecalde</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Working Notes of CLEF 2021 - Conference and Labs of the Evaluation Forum</title>
				<meeting><address><addrLine>Bucharest, Romania</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">Cedri at erisk 2021: A naive approach to early detection of psychological disorders in social media</title>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">P</forename><surname>Lopes</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">CEUR Workshop Proceedings</title>
				<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="981" to="991" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<monogr>
		<title level="m" type="main">Attention is all you need</title>
		<author>
			<persName><forename type="first">A</forename><surname>Vaswani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Shazeer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Parmar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Uszkoreit</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Jones</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">N</forename><surname>Gomez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Kaiser</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Polosukhin</surname></persName>
		</author>
		<idno>CoRR abs/1706.03762</idno>
		<idno type="arXiv">arXiv:1706.03762</idno>
		<ptr target="http://arxiv.org/abs/1706.03762" />
		<imprint>
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Approximate nearest neighbor search on high dimensional data-experiments, analyses, and improvement</title>
		<author>
			<persName><forename type="first">W</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Sun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Lin</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Transactions on Knowledge and Data Engineering</title>
		<imprint>
			<biblScope unit="volume">32</biblScope>
			<biblScope unit="page" from="1475" to="1488" />
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<monogr>
		<title level="m" type="main">Annoy: Approximate Nearest Neighbors in C++/Python</title>
		<author>
			<persName><forename type="first">E</forename><surname>Bernhardsson</surname></persName>
		</author>
		<ptr target="https://pypi.org/project/annoy/" />
		<imprint>
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
	<note>Python package version 1.13.0</note>
</biblStruct>

<biblStruct xml:id="b13">
	<monogr>
		<title level="m" type="main">Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs</title>
		<author>
			<persName><forename type="first">Y</forename><forename type="middle">A</forename><surname>Malkov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">A</forename><surname>Yashunin</surname></persName>
		</author>
		<idno>CoRR abs/1603.09320</idno>
		<idno type="arXiv">arXiv:1603.09320</idno>
		<ptr target="http://arxiv.org/abs/1603.09320" />
		<imprint>
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">Overview of erisk at CLEF 2020: Early risk prediction on the internet (extended overview)</title>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">E</forename><surname>Losada</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Crestani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Parapar</surname></persName>
		</author>
		<ptr target="http://ceur-ws.org/Vol-2696/paper_253.pdf" />
	</analytic>
	<monogr>
		<title level="m">Working Notes of CLEF 2020 - Conference and Labs of the Evaluation Forum</title>
				<meeting><address><addrLine>Thessaloniki, Greece</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2020">2020</date>
			<biblScope unit="volume">2696</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<monogr>
		<title level="m" type="main">Universal sentence encoder</title>
		<author>
			<persName><forename type="first">D</forename><surname>Cer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Kong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Hua</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Limtiaco</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">S</forename><surname>John</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Constant</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Guajardo-Cespedes</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Yuan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Tar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Sung</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Strope</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Kurzweil</surname></persName>
		</author>
		<idno>CoRR abs/1803.11175</idno>
		<idno type="arXiv">arXiv:1803.11175</idno>
		<ptr target="http://arxiv.org/abs/1803.11175" />
		<imprint>
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">Deep unordered composition rivals syntactic methods for text classification</title>
		<author>
			<persName><forename type="first">M</forename><surname>Iyyer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Manjunatha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Boyd-Graber</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Daumé</surname><genName>III</genName></persName>
		</author>
		<idno type="DOI">10.3115/v1/P15-1162</idno>
		<ptr target="https://aclanthology.org/P15-1162" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing</title>
		<title level="s">Long Papers</title>
		<meeting>the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing<address><addrLine>Beijing, China</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2015">2015</date>
			<biblScope unit="volume">1</biblScope>
			<biblScope unit="page" from="1681" to="1691" />
		</imprint>
	</monogr>
	<note>Association for Computational Linguistics</note>
</biblStruct>

<biblStruct xml:id="b17">
	<monogr>
		<author>
			<persName><forename type="first">M</forename><surname>Aumüller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Bernhardsson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">J</forename><surname>Faithfull</surname></persName>
		</author>
		<idno>CoRR abs/1807.05614</idno>
		<idno type="arXiv">arXiv:1807.05614</idno>
		<ptr target="http://arxiv.org/abs/1807.05614" />
		<title level="m">Ann-benchmarks: A benchmarking tool for approximate nearest neighbor algorithms</title>
				<imprint>
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">BERT: pre-training of deep bidirectional transformers for language understanding</title>
		<author>
			<persName><forename type="first">J</forename><surname>Devlin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Chang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Toutanova</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/n19-1423</idno>
		<ptr target="https://doi.org/10.18653/v1/n19-1423" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019</title>
				<editor>
			<persName><forename type="first">J</forename><surname>Burstein</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">C</forename><surname>Doran</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">T</forename><surname>Solorio</surname></persName>
		</editor>
		<meeting>the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019<address><addrLine>Minneapolis, MN, USA</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2019">June 2-7, 2019</date>
			<biblScope unit="volume">1</biblScope>
			<biblScope unit="page" from="4171" to="4186" />
		</imprint>
	</monogr>
	<note>Association for Computational Linguistics</note>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
