<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Keyphrase extraction from Slovak court decisions</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Dávid</forename><surname>Varga</surname></persName>
							<email>david.varga@student.upjs.sk</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Faculty of Science</orgName>
								<orgName type="department" key="dep2">Institute of Computer Science</orgName>
								<orgName type="institution">Pavol Jozef Šafárik University in Košice</orgName>
								<address>
									<addrLine>Jesenná 5</addrLine>
									<postCode>040 01</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Šimon</forename><surname>Horvát</surname></persName>
							<email>simon.horvat@student.upjs.sk</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Faculty of Science</orgName>
								<orgName type="department" key="dep2">Institute of Computer Science</orgName>
								<orgName type="institution">Pavol Jozef Šafárik University in Košice</orgName>
								<address>
									<addrLine>Jesenná 5</addrLine>
									<postCode>040 01</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Zoltán</forename><surname>Szoplák</surname></persName>
							<email>zoltan.szoplak@student.upjs.sk</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Faculty of Science</orgName>
								<orgName type="department" key="dep2">Institute of Computer Science</orgName>
								<orgName type="institution">Pavol Jozef Šafárik University in Košice</orgName>
								<address>
									<addrLine>Jesenná 5</addrLine>
									<postCode>040 01</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Ľubomír</forename><surname>Antoni</surname></persName>
							<email>lubomir.antoni@upjs.sk</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Faculty of Science</orgName>
								<orgName type="department" key="dep2">Institute of Computer Science</orgName>
								<orgName type="institution">Pavol Jozef Šafárik University in Košice</orgName>
								<address>
									<addrLine>Jesenná 5</addrLine>
									<postCode>040 01</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Stanislav</forename><surname>Krajči</surname></persName>
							<email>stanislav.krajci@upjs.sk</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Faculty of Science</orgName>
								<orgName type="department" key="dep2">Institute of Computer Science</orgName>
								<orgName type="institution">Pavol Jozef Šafárik University in Košice</orgName>
								<address>
									<addrLine>Jesenná 5</addrLine>
									<postCode>040 01</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Peter</forename><surname>Gurský</surname></persName>
							<email>peter.gursky@upjs.sk</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Faculty of Science</orgName>
								<orgName type="department" key="dep2">Institute of Computer Science</orgName>
								<orgName type="institution">Pavol Jozef Šafárik University in Košice</orgName>
								<address>
									<addrLine>Jesenná 5</addrLine>
									<postCode>040 01</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Laura</forename><forename type="middle">Bachňáková</forename><surname>Rózenfeldová</surname></persName>
							<email>laura.rozenfeldova@upjs.sk</email>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Faculty of Science</orgName>
								<orgName type="department" key="dep2">Institute of Computer Science</orgName>
								<orgName type="institution">Pavol Jozef Šafárik University in Košice</orgName>
								<address>
									<addrLine>Jesenná 5</addrLine>
									<postCode>040 01</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">Keyphrase extraction from Slovak court decisions</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">EC214530B15D470AEAE8A79A1906796A</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2023-03-24T02:15+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>keyphrase</term>
					<term>keyword</term>
					<term>extraction</term>
					<term>legal text</term>
					<term>word network</term>
					<term>embedding</term>
					<term>court decision</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>Keyphrase extraction is a vital subtask of text summarization and comparison, through which we can obtain the most relevant set of words and phrases that describe the content of a given document. In this paper we test multiple approaches of unsupervised keyword extraction on a set of court decisions. These approaches are TF-IDF, YAKE! and a graph-based weighted PageRank algorithm. We combine these algorithms with a dictionary-based word embedding method in order to capture the semantic relationships between the potential keyphrases. Extracted keyphrases can be used for semantic indexing of court decisions, which can help with finding decisions with similar content.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>In their decision-making, judges need to ensure the consistency of decisions with the standard practice of courts. Getting an overview of similar relevant court decisions is a time-consuming process. Currently, available tools have limited options for filtering a set of all decisions, often resulting in an extensive collection of documents. In the Slovak court system, only the Supreme Court has an analytical department that has human resources to create overviews of relevant court decisions for judges. With a vast number of court cases, common judges often do not have time and resources to get to all relevant documents, which can cause essential decisions to be overlooked by judges. The analytical department of the Supreme Court manually creates metadata to all Supreme Court decisions, including keyphrases, to speed up the overview-making process, especially by narrowing the search results down to a reasonable size. Automatic keyphrase extraction can help with manual annotation by providing hints, thus making the annotation process semi-automatic and faster. This increases the number of court decision annotations that can be used for searching and filtering.</p><p>In the field of natural language processing, automatic keyphrase extraction can be used as a form of text summarization. Manually extracting keyphrases consists of reading the whole document, understanding its content and selecting the phrases used, or generating phrases that aptly describe the document. Manual extraction of keyphrases from long texts or from a large number of texts is time-consuming and demanding on human resources. These are the reasons why it is appropriate to automate this process. The process of automated extraction of keyphrases consists of selecting candidate phrases from a document or external source, which are evaluated according to how well they describe the document. 
An evaluation algorithm is used to evaluate the candidate phrases, which calculates the score according to statistics, semantics, or both at the same time. The candidate phrases with the highest score are then selected as keyphrases.</p><p>Keyphrase extraction algorithms are divided into two main groups, supervised and unsupervised algorithms. We can train supervised algorithms on a labeled dataset, while the resulting models often achieve high accuracy <ref type="bibr" target="#b0">[1]</ref>. If a dataset that is labeled with keyphrases is not available, it is advisable to use unsupervised algorithms. These types of algorithms usually use statistical metrics that take into account the number of occurrences of phrases, the co-occurrence of phrases, the position of phrases within the document and others. These algorithms are often combined with graph algorithms, word embeddings, or other language models.</p><p>In this article, we will focus on the extraction of keyphrases from Slovak court decisions. This dataset does not contain manually extracted keyphrases, therefore we decided to use a combination of unsupervised statistical and semantic approaches.</p><p>The objectives of this article are:</p><p>• design and implementation of an algorithm for extracting keyphrases from Slovak court decisions; • evaluation of the results of extracted keyphrases on a set of court decisions.</p><p>This article is organized into four sections. In Section 2, we describe the related approaches to automated keyphrase extraction and other works related to legal document processing. In Section 3, we propose multiple algorithms to extract the keyphrases from Slovak court decisions. Finally, we analyze the results of the algorithms in Section 4.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Related works</head><p>A lot of research has been done on applying NLP techniques to law texts and court decisions. NLP techniques are used in different tasks, for example: predicting the outcomes of court decisions <ref type="bibr" target="#b1">[2,</ref><ref type="bibr" target="#b2">3,</ref><ref type="bibr" target="#b3">4]</ref>, searching for insufficiently reasoned court decisions <ref type="bibr" target="#b4">[5]</ref>, creating electronic versions of court decisions <ref type="bibr" target="#b5">[6]</ref> or creating a collection of datasets for evaluating performance across different legal text understanding tasks <ref type="bibr" target="#b6">[7]</ref>.</p><p>An international voluntary association called the Free Access to Law Movement (FALM) <ref type="bibr" target="#b7">[8]</ref> was founded in 1992 and has more than 60 member organisations from around the globe. FALM members provide free access to legal information, group legal documents into one place and analyse law texts. FALM member CanLII [9] uses software to process Canadian court decisions. CanLII creates links to articles that are used in court decisions and to other court decisions used as citations. This software also creates a short description of the court decision and selects keyphrases. These can be used to save time and effort for legal experts such as judges and lawyers.</p><p>Algorithms used for legal text summarization are summed up in a survey paper <ref type="bibr" target="#b8">[10]</ref>. However, in this section, we focus specifically on keyphrase extraction by use of statistical approaches and unsupervised algorithms. We also summarize the principles of selecting appropriate keyphrases based on observations.</p><p>The simplest approach to select keyphrases is to count the n-grams in the text and select the most common n-grams <ref type="bibr" target="#b9">[11]</ref>. 
This approach is also called Bag of Words or BoW and does not take into account synonyms, grammar or the meaning of individual n-grams. The downside of using a BoW approach is that it does not select those keyphrases that are concise to the text and at the same time occur rarely in the text.</p><p>A significant improvement over the BoW method is TF-IDF <ref type="bibr" target="#b10">[12]</ref>. TF-IDF takes into account the whole corpus and penalizes phrases that occur in many documents. It is often used as a baseline method or in one of the steps of an algorithm, for example KP-Miner <ref type="bibr" target="#b11">[13]</ref> or Liu's clustering algorithm <ref type="bibr" target="#b12">[14]</ref>. We will describe TF-IDF in more detail in the next section.</p><p>Three desirable properties of keyphrases are described in <ref type="bibr" target="#b12">[14]</ref>:</p><p>• Understandable. Keyphrases should be easy to understand. • Relevant. Keyphrases should relate to the main topic of the document. • Good coverage. Keyphrases should cover all parts of the document appropriately.</p><p>According to these properties, Liu's clustering algorithm <ref type="bibr" target="#b12">[14]</ref> was created, which used statistical, semantic and clustering methods simultaneously. The first step of the algorithm was to search for candidate words. From these, keyphrases of several words are composed in the next steps of the algorithm. Subsequently, semantic closeness scores were calculated for the candidate words, according to their common occurrences within a fixed-length window and also according to an external source – Wikipedia. For each word, they created an embedding, where on each index of the vector, a value representing the relationship between the word and a specific article from Wikipedia was calculated using TF-IDF. Candidate words were clustered according to semantic closeness, which grouped semantically similar words into individual clusters. 
Subsequently, exemplary words representing the entire cluster were selected from individual clusters, which had to be extended to phrases composed of several words. The keyphrases for the document were selected so that the algorithm processed all the words of the document, and if the word type was a noun that was also an exemplary word, then the word was selected in the list of keyphrases along with adjectives in its neighbourhood in the original text. One of the latest language-independent unsupervised keyphrase extraction algorithms is YAKE! <ref type="bibr" target="#b13">[15]</ref>. It uses statistical information, such as word counts and word occurrences, to identify keyphrases in unstructured texts. Its great advantage is that it only works with the current document during extraction, so it is not necessary to have the whole corpus of similar texts or other text sources available. The algorithm consists of five steps: <ref type="bibr" target="#b0">(1)</ref> preprocessing the document into a machine-readable format, which results in tagged individual words; (2) for each word, a representation is created consisting of a set of properties evaluated by statistical measurements; (3) the individual properties of the words are heuristically combined into one score, which represents the importance of the word; (4) generating n-grams from candidate words and assigning a degree of relevance; (5) deduplication of keyphrases that are too similar and ranking by relevance.</p><p>Another approach to extracting keyphrases is to use graphs and graph algorithms. The text may be represented by a graph such that the vertices of the graph are candidate phrases and the edges represent the relationship between these phrases. Subsequently, a value for each vertex of the graph is assigned using the selected evaluation function, and the edges and their weights are used to calculate this value. 
Thus, the individual methods differ in the use of different types of graphs and evaluation functions. One of the first algorithms to extract keywords from a text that uses a graph is TextRank <ref type="bibr" target="#b14">[16]</ref>, which has inspired a number of other graph-based algorithms. Its evaluation function calculates the values for the vertices of the oriented graph recursively and the information at the input of this function is global, i.e., in each step, it comes from the whole graph. The evaluation function used is the PageRank <ref type="bibr" target="#b15">[17]</ref> algorithm, which is iterative and its input is the oriented graph. PageRank was originally designed for scoring web pages by importance on the web, but in TextRank it is used to assign a score to candidate keyphrases.</p><p>RAKE <ref type="bibr" target="#b16">[18]</ref> is another graph-based unsupervised algorithm and it uses word frequency and word co-occurrence to create a graph and assign scores to phrases. It needs a list of stop-words and delimiters at the input, but it is able to identify interior stop-words in phrases.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Methods</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Background knowledge</head><p>In this section we describe how we mine knowledge from sources other than the document from which we want to extract keyphrases. This background knowledge is used as a weighting mechanism in the methods described in the next section.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.1.">Term frequency – inverse document frequency.</head><p>TF-IDF is a statistical measure that evaluates how relevant a word is to a document in a collection of documents. This measure is the product of two metrics:</p><p>1. term frequency expresses how many times a word appears in a document, 2. the inverse document frequency expresses how unique a given word is to a document. It is computed from the fraction of documents in the whole set that contain the word:</p><formula xml:id="formula_0" notation="TeX">\mathrm{idf}(t, D) = \log \frac{|D|}{|\{d \in D : t \in d\}|}</formula><p>where |𝐷| is the total number of documents and |{𝑑 ∈ 𝐷 ∶ 𝑡 ∈ 𝑑}| is the number of documents where the term 𝑡 appears.</p><p>So idf examines the frequency values in all documents to reduce the impact of frequent words.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.2.">Phrase network.</head><p>To use keyphrase extraction methods it is in our best interest to develop a vocabulary of potential keyphrases. Keyphrase extraction methods such as TF-IDF prioritize keyphrases that are unique to a specific document that might not be suitable for the purposes of topic clustering. Using this vocabulary as a basis for creating phrase embeddings can help us semantically compare the keyphrases in order to facilitate better keyphrase selection.</p><p>Let 𝑉 be the set of all unigrams, bigrams and trigrams used in court decisions documents <ref type="foot" target="#foot_0">1</ref> . Relations among phrases in 𝑉 are denoted as set of 𝐸. We mine these relations mainly from Slovak Law Thesaurus (SLT) <ref type="bibr" target="#b17">[19]</ref> as follows:</p><p>1. Let phrase 𝑖 and phrase 𝑗 be words or phrases defined in SLT. In case that phrase 𝑗 occurs in definition of phrase 𝑖 , we expand our set 𝐸 by the pair (phrase 𝑖 , phrase 𝑗 ). 2. Let phrase 𝑖 be word or phrase defined in SLT. Let {phrase 1 , phrase 2 , … , phrase 𝑗 } ⊆ 𝑉 be the words used in definition of phrase 𝑖 , but without definition in SLT. We add pairs (phrase 𝑖 ,phrase 1 ), (phrase 𝑖 ,phrase 2 ), … , (phrase 𝑖 ,phrase 𝑗 ) to the set 𝐸. Not all words appearing in the definition are related to a defined phrase, therefore we weigh these relations with the TF-IDF used in our global weight function. The set of documents used for IDF calculation 𝐷 is the set of all definitions from SLT. 3. Let phrase 𝑖 ∈ 𝑉 be a phrase that is not found in SLT. We find the definitions of individual words that make up the phrase in the Dictionary of Slovak language and continue as in the previous step. Let {phrase 1 , phrase 2 , … , phrase 𝑗 } ⊆ 𝑉 be the words used in definition of phrase 𝑖 . We add pairs (phrase 𝑖 ,phrase 1 ), (phrase 𝑖 ,phrase 2 ), … , (phrase 𝑖 ,phrase 𝑗 ) to the set 𝐸. 
The set of documents used for IDF calculation 𝐷 is the set of all definitions from the Dictionary of Slovak language.</p><p>Using this set of definitions, we model a network of legal phrases defined as follows:</p><p>Let 𝐺 = (𝑉 , 𝐸, 𝜙) be a directed evaluated graph, where 𝜙 ∶ 𝐸 → 𝑅 is a function:</p><formula xml:id="formula_1" notation="TeX">\phi(e) = \begin{cases} 1, &amp; \text{if } e \text{ gained in } 1 \\ \text{tf-idf}(\mathrm{phrase}_i, \mathrm{phrase}_j), &amp; \text{if } e \text{ gained in } 2 \lor 3 \end{cases}</formula><p>such that 𝑒 = (phrase 𝑖 , phrase 𝑗 ), phrase 𝑖 ∈ 𝑉 is a defined phrase, phrase 𝑗 ∈ 𝑉 is a phrase occurring in the definition of phrase 𝑖 and 𝑒 ∈ 𝐸.</p><p>In the next step, we use the graph embedding techniques described in <ref type="bibr" target="#b18">[20]</ref>, which produce a semantic representation for each phrase from 𝑉. In our approach, we use the Node2Vec algorithm, described in <ref type="bibr" target="#b19">[21]</ref>, which is one of the graph embedding techniques based on a random walk. These vectors with semantic interpretation are used as background knowledge for the algorithms described below. A detailed description of the method for obtaining embeddings is given in <ref type="bibr" target="#b20">[22]</ref>.</p><p>Suppose we need an embedding for a phrase<ref type="foot">2</ref> consisting of more than one word. We compute it as an element-wise average of all word embeddings occurring in the phrase.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Weighted PageRank</head><p>In order to incorporate our vocabulary and embeddings, we can use a keyphrase selection method described in <ref type="bibr" target="#b21">[23]</ref> in conjunction with our phrase embeddings.</p><p>First, we create an undirected weighted graph representing a given court decision, with each node corresponding to a phrase of the decision present in our vocabulary 𝑉. A pair of nodes 𝑣 1 and 𝑣 2 , each representing a potential keyphrase, will be connected by an edge if they are located within a fixed-size sliding window. The weight of these edges represents the similarity between the potential keyphrases that make up its nodes. This similarity is defined by two metrics. One of them is the Dice coefficient, which measures the interlinkedness of the two phrases. It is calculated as twice the number of times the phrases appear in the decision as a tuple, divided by the sum of the frequencies of the phrases individually:</p><formula xml:id="formula_2" notation="TeX">\mathrm{dice}(v_i, v_j) = \frac{2 \times \mathrm{freq}(v_i, v_j)}{\mathrm{freq}(v_i) + \mathrm{freq}(v_j)}<label>(1)</label></formula><p>where 𝑣 𝑖 and 𝑣 𝑗 are vertices connected by an edge, freq(𝑣 𝑖 ) is the number of times the vertex 𝑣 𝑖 appears in the document, and freq(𝑣 𝑖 , 𝑣 𝑗 ) is the number of times the vertices 𝑣 𝑖 and 𝑣 𝑗 form a tuple, in whichever order.</p><p>The second metric is inspired by Newton's law of universal gravitation. 
The frequencies of the phrases are used as the mass of the objects, and the distance is calculated as the cosine distance between the embeddings of the two phrases.</p><formula xml:id="formula_3" notation="TeX">\mathrm{attr}(v_i, v_j) = \frac{\mathrm{freq}(v_i) \times \mathrm{freq}(v_j)}{d(v_i, v_j)^2}<label>(2)</label></formula><p>where 𝑑(𝑣 𝑖 , 𝑣 𝑗 ) is the cosine distance between the embeddings of phrases 𝑣 𝑖 and 𝑣 𝑗 .</p><p>The weight of an edge is then calculated by combining the attraction force and the Dice coefficient:</p><note place="foot" n="2">We already have embeddings for phrases defined in SLT. Here we talk about phrases from 𝑉 (or unseen) that do not occur in any relation in 𝐸.</note><formula xml:id="formula_4" notation="TeX">w_{ij} = \mathrm{attr}(v_i, v_j) \times \mathrm{dice}(v_i, v_j)<label>(3)</label></formula><p>To extract keyphrases from the nodes of the graph, we will make use of the weighted PageRank algorithm. The PageRank algorithm is an iterative algorithm that calculates a score for each node of the graph, with a higher score indicating higher suitability as a keyphrase. The weighted PageRank algorithm ranks a node according to the sum of the ranks of all its adjacent nodes, weighted by the edges that connect them.</p><p>Then, the PageRank score is calculated recursively for each node of the graph. 
The score at a given time step is calculated as:</p><formula xml:id="formula_5" notation="TeX">P_t(v_i) = (1 - d) + d \times \sum_{v_j \in C(v_i)} \frac{w_{ij}}{\sum_{v_k \in C(v_j)} w_{jk}} P_{t-1}(v_j)<label>(4)</label></formula><p>where 𝑃 𝑡 (𝑣 𝑖 ) is the PageRank score for the node 𝑣 𝑖 at time 𝑡, 𝐶(𝑣 𝑖 ) is the set of nodes adjacent to node 𝑣 𝑖 , and 𝑑 is the damping factor.</p><p>The results obtained from the PageRank algorithm can then be used to determine the most likely keyphrase candidates, with a higher score representing a more suitable keyphrase.</p><p>The issue with using the weighted PageRank algorithm on its own is that it works only with a given document, which makes it useful in extracting keyphrases that describe the text itself, but not what differentiates it from other texts. Since the texts are judicial decisions, many court-centric phrases would hinder our ability to differentiate court decisions by topic. Therefore the score for each phrase we obtained from the weighted PageRank was multiplied by its IDF score, calculated from all available court decisions as described by the TF-IDF metric. Multiplying the PageRank score by the IDF should favor keywords that are not as frequent and would therefore probably not be court-centric and thus more relevant to the specific topic of that decision.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3.">Autoencoders</head><p>Keyword extraction methods like TF-IDF penalize phrases that are frequent in many documents, but infrequent phrases are not necessarily semantically informative. The task of removing court-centric phrases would be better achieved by using some form of semantic comparison. Phrases that are semantically dissimilar to the meaning of the majority of phrases are more likely to be keyphrases that can be used to meaningfully cluster documents. To perform semantic comparisons, we can combine our phrase embeddings with the autoencoder method.</p><figure><head>Figure 1: The scheme of the autoencoder</head></figure><p>Autoencoders, described in detail in <ref type="bibr" target="#b22">[24]</ref>, are unsupervised neural networks that aim to create a representation of data that selects only the most relevant parameters, which can be used to reconstruct the original data. Autoencoders consist of two main parts: the encoder, which converts the input into an encoding (usually of lesser dimensionality than the input), and a decoder that tries to reconstruct the input from the encoding (Fig. <ref type="figure">1</ref>). Using simple feedforward neural networks, the encoding ℎ can be calculated as:</p><formula xml:id="formula_6">ℎ = 𝜔(𝑊 𝑥 + 𝑏)<label>(5)</label></formula><p>where 𝑥 is the input, 𝜔 is the element-wise activation function, 𝑊 is a weight matrix and 𝑏 is the bias. This encoding can then be used to obtain 𝑥 ′ , the reconstruction of the input. 
The reconstruction is calculated as:</p><formula xml:id="formula_7">𝑥 ′ = 𝜔 ′ (𝑊 ′ ℎ + 𝑏 ′ )<label>(6)</label></formula><p>where 𝜔 ′ , 𝑊 ′ and 𝑏 ′ might be different from 𝜔, 𝑊 and 𝑏.</p><p>We have trained our autoencoder to reconstruct the embeddings of phrases of the vocabulary 𝑉, described in 3.1.2.<ref type="foot">3</ref> Due to the vocabulary being made up primarily of phrases relevant to court decisions, we can infer that the reconstruction performance will be better with phrases explicitly related to court decisions. However, these phrases are detrimental to topic-based differentiation. Therefore, by penalizing a high reconstruction success of a keyphrase, we can filter out those that are not relevant to the topic of that court decision. In our case, we multiplied the TF-IDF score of keyphrases with the cosine distance between the input embedding and the reconstructed embedding from the autoencoder:</p><formula notation="TeX">\mathrm{score}(v_i) = \text{tf-idf}(v_i) \cdot \cos(\mathrm{emb}(v_i), \mathrm{rec}(\mathrm{emb}(v_i)))<label>(7)</label></formula><note place="foot" n="3">Link to lemmatized court decisions. https://bit.ly/3zUwbYA</note></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Evaluation</head><p>We have implemented two algorithms to serve as our baseline. The first is the regular TF-IDF metric used for keyword extraction, using all available court decisions to calculate the IDF value. This method is corpus-dependent, so other documents are taken into account. The second is the YAKE! algorithm <ref type="bibr" target="#b13">[15]</ref>, which takes into account only the current document. The algorithm described in 3.2 combines weighted PageRank with our phrase embeddings and multiplies the result by the IDF score of the TF-IDF metric. We will refer to this algorithm as WPR. The algorithm described in 3.3, which multiplies the regular TF-IDF score by the cosine distance between a phrase embedding and its autoencoder reconstruction, we labelled as AE.</p><p>Since we did not have access to extracted keyphrases of any court decisions, we have chosen five random court decisions for manual and expert evaluation. We have asked a legal expert to evaluate results in three ways:</p><p>• creation of abstracts that offer a brief summary of the content of the decisions (see figures 1 and 3), • manual extraction of keyphrases from the decisions using the dictionary of keyphrases used by the analytical department of the Supreme Court (see figures 1 and 3), • the expert's opinion on the potential of the computed keyphrases to be included in the dictionary or to be used in any other way (see section 4.1).</p><p>We summarized the outputs of the algorithms into tables 2 and 4, where the rows are documents and the columns are algorithms. Each table cell consists of the top five keyphrases found by the given algorithm for the given document.</p><p>We have compared the computed keyphrases with abstracts and manually extracted keyphrases. The phrases that are present in the abstract are highlighted in yellow. 
If the keyphrase matches the manually extracted keyphrase, it is highlighted by a black frame.</p><p>As we can see, the YAKE! 
algorithm provides many keyphrases that cannot be found in abstracts or manual keyphrases. This is due to the chosen keyphrases being too long and heavily related to the topic of judicial decisions; such keyphrases offer little in terms of differentiating decisions from one another, since the method is corpus-independent.</p><p>The WPR algorithm (weighted PageRank multiplied by the IDF score) performs quite a bit better, achieving good performance on documents 3 and 5, but is outclassed by the algorithms using TF-IDF as the basis of selection. This is likely because the WPR algorithm prefers phrases that are frequent and that are semantically similar to the other keyphrases, which is a good approach for general keyphrase extraction; however, those phrases might not be well suited to clustering within a corpus.</p><p>TF-IDF on its own achieves good performance, as the metric is built for extracting phrases that are good unique descriptors of documents. It brings many matches on all of the documents, with the top five keyphrases being good topic descriptors for all documents.</p><p>The highest number of matches with abstracts and manual keyphrases was achieved by the AE algorithm, combining TF-IDF with the reconstruction error of the autoencoder.</p><p>An interesting finding of all evaluated methods is that the resulting phrases are found mainly in abstracts and less among manually obtained phrases. We would also like to point out that several manually extracted phrases are not even in the abstracts themselves.</p><p>We asked a legal expert to weigh in on the results from her perspective. We present her statement in full in the next section.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.">Legal expert statement</head><p>The keyphrases selected by the analysis define the nature of the respective judicial decisions to varying degrees. In some cases, the selected keyphrases sufficiently characterize the decisions, e. g. as regards the second decision where it is clear that the decision regards the cancellation of the child support obligation. In other cases, the keyphrases extracted from the decisions' text describe the factual circumstances of the case rather than the relevant legal institutes applied in them or the legal process as such. To illustrate, the keyphrases describing the first decision focus on the factual background of the case, namely the asserting of warranty ("refund") for the services provided ("to train"), but do not specifically define the applicable legal institute (liability for defects), or the type of contract concluded between the parties to a dispute (framework agreement on cooperation), which would be most likely the keyphrases used by the legal expert to search for decisions in analogous cases. Similarly, it is unclear from the keyphrases characterizing other decisions examined what type of a decision is adopted (decision on the merits of the case or a procedural decision). To demonstrate, it is not apparent that the third decision regards the appellant's court reversal and referral of the decision of the court of the first instance, that in the fourth decision, the court discontinued the execution of a judgment or that the fifth decision approves the agreement on guilt and punishment (although in this case the phrase "approve the agreement" has been selected). This is, however, understandable, as these are all legal categories that may not be immediately identifiable from the decisions' text alone without previous legal input.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.">Summary and future work</head><p>As a proof of concept, this paper proposes and evaluates unsupervised keyword extraction methods, because we lack labeled data. We can conclude from the statement of a legal expert that the most relevant keyphrases are legal institutes and legal processes.</p><p>In our new project, we plan to cooperate with the Supreme Court of the Slovak Republic, in which we should be able to work with manually extracted phrases from their court decisions. This cooperation will allow us to design and test supervised keyword extraction methods and compare them with the methods presented in this paper. In our future work, we want to include laws and regulations cited by court decisions as a source of names of legal institutes.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Conclusion</head><p>In this article, we studied the problem of revealing keyphrases in the court decisions of the Slovak Republic. We proposed two unsupervised algorithms and evaluated them on five arbitrary court decisions. We have compared computed keyphrases with expert-written abstracts and manually extracted keyphrases. The results show that the methods extract keyphrases that are mainly included in abstracts rather than among the manually extracted keyphrases. The best results were achieved by the AE algorithm, which combines TF-IDF with the reconstruction error of the autoencoder.</p><p>We believe that the results of the algorithms can be used as recommendations for manual annotation of court decisions with keyphrases if the intersection of found keyphrases with a dictionary of legal phrases is applied. They can also be used to enrich search results and expand filtering options.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 1</head><p>Abstracts from court decisions and manually extracted keyphrases by legal expert translated to English.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>No.</head><p>Abstract Manually extracted keyphrases 1</p><p>The complainant (lector) demanded via judicial proceedings that the defendant pays the full price of the invoice for the services provided (realization of professional training). The defendant, who was the complainant's customer, paid the invoice only in part (liability for delay) due to considering the services provided by the complainant to be of poor quality (liability for defects). The defendant has also demanded a refund.</p><p>contract, liability for defects, liability, default, client, innominate contract, warranty, service, action 2</p><p>The complainant demanded the court to cancel the duty to support and maintain against the two defendants, who graduated from high school, are legal adults who are able to earn a living wage. The defendants agreed with the cancellation of the duty to support and maintain.</p><p>alimony, duty to support and maintain 3</p><p>The complainant applied a bill of exchange against the defendant, which was rejected by the district court. The reasoning of rejection was the fact that the district court called for the complainant to fill in additional data into the proposal form, which the complainant did not do. The court of appeals ruled in favour of the complainant, affirming that he did not need to fill in his proposal with additional data. The first instance court arrived at the decision by applying incorrect legislation and incorrect interpretation of the legislation and EU rights.</p><p>bill of exchange, claim, commercial paper, appeal, referral, reversing decision 4</p><p>The court rejected the proposal of granting authorization to a court distrainor and stopped all distraint proceedings. 
The court didn't assign the distraint expenses to the court distrainor.</p><p>discontinue distraint, distraint proceedings, distraint, court distrainor 5</p><p>The accused was negligently driving a motor vehicle, not paying attention to the traffic situation on the road and did not give way to a crossing pedestrian. A collision occurred, where the pedestrian suffered injuries consisting of multiple bone fractures and internal bleeding. The accused inflicted grievous bodily harm to the pedestrian due to negligence, due to which the accused was charged with inflicting injury. The accused received a fine, had her driving license revoked for all types of motor vehicles, and entered a plea agreement.</p><p>bodily harm, agreement on guilt and punishment, negligence, punishment, criminal offence, punishment by disqualification  </p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>2</head><label>2</label><figDesc></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 2</head><label>2</label><figDesc>Top 5 keyphrases translated to English language.</figDesc><table><row><cell>No.</cell><cell>TF-IDF</cell><cell>YAKE!</cell><cell>WPR</cell><cell>AE</cell></row><row><cell></cell><cell>to train</cell><cell>according to the PRINCE methodology</cell><cell>to train</cell><cell>to train</cell></row><row><cell></cell><cell>customer</cell><cell>between the participants of the proceedings</cell><cell>lector</cell><cell>customer</cell></row><row><cell>1</cell><cell>lector</cell><cell>PRINCE methodology training</cell><cell>trainer</cell><cell>lector</cell></row><row><cell></cell><cell>project</cell><cell>according to the commercial law section</cell><cell>accreditation</cell><cell>email</cell></row><row><cell></cell><cell>studies</cell><cell>participants of the proceedings was</cell><cell>studies</cell><cell>refund</cell></row><row><cell></cell><cell>studies</cell><cell>district court of Námestovo</cell><cell>loader</cell><cell>duty to support and maintain</cell></row><row><cell></cell><cell>duty to support and maintain</cell><cell>by the judgment of the district court</cell><cell>high school</cell><cell>court of Námestovo</cell></row><row><cell>2</cell><cell>support and maintain to work court of Námestovo</cell><cell>on the basis of an employment contract to support according to the paragraph he finished high school studies</cell><cell>worker to take care of part-time job</cell><cell>cancel the duty to support and maintain contract of employment obligation towards</cell></row><row><cell>3</cell><cell>bill of exchange form first instance court</cell><cell>low value of the dispute to apply the claim of the court to apply the claim</cell><cell>assumption receiving bill of exchange</cell><cell>bill of exchange the first instance court to apply the claim</cell></row><row><cell></cell><cell>first instance fill out</cell><cell>the first instance court in connection 
to the court of appeals</cell><cell>form stage</cell><cell>form of application owner of the bill of exchange</cell></row><row><cell></cell><cell>court distrainor</cell><cell>court of Dolný Kubín</cell><cell>Dolný Kubín</cell><cell>court distrainor</cell></row><row><cell>4</cell><cell>Dolný Kubín Dolný to grant authorization to grant</cell><cell>first instance court district court of Dolný Kubín apartment Dolný Kubín Dolný Kubín case reference</cell><cell>court distrainor Dolný to apply to instruct case reference</cell><cell>Dolný Kubín to grant a warrant court court expenses of distraint</cell></row><row><cell>5</cell><cell>penalty guilt bone to charge fracture</cell><cell>by paragraph paragraph paragraph paragraph letter health by paragraph months by paragraphs Euro by paragraph</cell><cell>pedestrian pedestrian crossing shovel bone lane</cell><cell>road traffic fracture bone penalty approve the agreement</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>Table 3</head><label>3</label><figDesc>Abstracts from court decisions and manually extracted keyphrases by legal expert in Slovak. Navrhovateľ si v návrhu uplatnil voči odporcovi pohľadávku, ktorú mu okresný súd zamietol. Dôvodom zamietnutia bol ten, že okresný súd vyzval navrhovateľa o doplnenie údajov prostredníctvom tlačiva na doplnenie návrhu, ktoré navrhovateľ nedoplnil. Odvolací súd dal navrhovateľovi za pravdu, teda že navrhovateľ nemusel dopĺňať svoj návrh o ďalšie údaje. Prvostupňový súd dospel k rozhodnutiu na základe aplikácie nesprávnych právnych predpisov a nesprávnej interpretácie príslušných právnych predpisov a práva EÚ. zmenka, pohľadávka, cenné papiere, odvolanie, vrátenie veci, zrušujúce rozhodnutie 4 Súd zamietol žiadosť o udelenie poverenia pre súdnu exekútorku a zastavil exekučné konanie. Súd exekútorke trovy exekúcie neprisúdil. zastavenie exekúcie, exekučné konanie, exekúcia, exekútor 5 Obvinená viedla motorové vozidlo a nevenovala plnú pozornosť vedeniu vozidla. Nesledovala situáciu v cestnej premávke a nedala prednosť chodcovi prechádzajúceho cez priechod pre chodcov. Došlo k zrážke, pričom chodec utrpel poranenia pozostávajúce zo zlomením viacerých kostí a vnútorných krvácaní. Z nedbanlivosti spôsobila ťažkú ujmu na zdraví chodcovi, čím spáchala prečin ublíženia na zdraví. Obvinená dostala peňažný trest a trest zákazu činnosti viesť všetky druhy motorových vozidiel, pričom uzavrela dohodu o vine a treste. ujma na zdraví, dohoda o vine a treste, nedbanlivosť, trest, trestný čin, trest zákazu činnosti</figDesc><table><row><cell>No.</cell><cell>Abstract</cell><cell>Manually extracted keyphrases</cell></row><row><cell>1</cell><cell>Navrhovateľ (lektor) sa súdnym konaním domáhal, aby odporca uhradil faktúru za</cell><cell>zmluva, zodpovednosť za vady, zod-</cell></row><row><cell></cell><cell>poskytnuté služby (realizácia odborných školení) v plnej výške. 
Odporca, ktorý bol</cell><cell>povednosť, omeškanie, objednávateľ,</cell></row><row><cell></cell><cell>zákazníkom navrhovateľa, uhradil faktúru iba čiastočne (zodpovednosť za omeškanie)</cell><cell>nepomenovaná zmluva, reklamácia,</cell></row><row><cell></cell><cell>kvôli tomu, že navrhovateľ podľa neho poskytol vadné služby (zodpovednosť za vady).</cell><cell>služba, žaloba</cell></row><row><cell></cell><cell>Navrhovateľ taktiež podal reklamáciu.</cell><cell></cell></row><row><cell>2</cell><cell>Navrhovateľka žiadala, aby súd zrušil jej vyživovaciu povinnosť voči dvom odporcom, ktorí</cell><cell>výživné, vyživovacia povinnosť</cell></row><row><cell></cell><cell>ukončili stredoškolské štúdium, sú plnoletí a zarábajú si sami na živobytie. Odporcovia</cell><cell></cell></row><row><cell></cell><cell>súhlasili so zrušením vyživovacej povinnosti.</cell><cell></cell></row><row><cell>3</cell><cell></cell><cell></cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head>Table 4</head><label>4</label><figDesc>Top 5 keyphrases in Slovak language.</figDesc><table><row><cell>No.</cell><cell>TF-IDF</cell><cell>YAKE!</cell><cell>WPR</cell><cell>AE</cell></row><row><cell></cell><cell>školiť</cell><cell>podľa metodiky PRINCE</cell><cell>školiť</cell><cell>školiť</cell></row><row><cell></cell><cell>zákazník</cell><cell>medzi účastníkmi konania</cell><cell>lektor</cell><cell>zákazník</cell></row><row><cell>1</cell><cell>lektor</cell><cell>školenia metodiky PRINCE</cell><cell>školiteľ</cell><cell>lektor</cell></row><row><cell></cell><cell>projekt</cell><cell>podľa ods obchodného</cell><cell>akreditácia</cell><cell>email</cell></row><row><cell></cell><cell>štúdium</cell><cell>účastníkmi konania bola</cell><cell>štúdia</cell><cell>reklamácia</cell></row><row><cell></cell><cell>štúdium</cell><cell>okresného súdu námestovo</cell><cell>nakladač</cell><cell>vyživovacia povinnosť</cell></row><row><cell></cell><cell>vyživovacia povinnosť</cell><cell>rozsudkom okresného súdu</cell><cell>stredoškolský</cell><cell>súd námestovo</cell></row><row><cell>2</cell><cell>vyživovací pracovať súd Námestovo</cell><cell>základe pracovnej zmluvy živiť podľa ods ukončil stredoškolské štúdium</cell><cell>robotník opatrovať brigáda</cell><cell>zrušiť vyživovaciu povinnosť pracovná zmluva povinnosť voči</cell></row><row><cell>3</cell><cell>zmenka tlačivo prvostupňový súd</cell><cell>nízkou hodnotou sporu uplatnenie pohľadávky súdu uplatnenie pohľadávky</cell><cell>dohad prijímací zmenka</cell><cell>zmenka prvostupňový súd uplatniť pohľadávku</cell></row><row><cell></cell><cell>prvostupňový vyplniť</cell><cell>prvostupňový súd súvislosti odvolací súd</cell><cell>tlačivo etapa</cell><cell>tlačivo návrh majiteľ zmenky</cell></row><row><cell></cell><cell>súdna exekútorka</cell><cell>súd Dolný Kubín</cell><cell>Dolný Kubín</cell><cell>súdna exekútorka</cell></row><row><cell>4</cell><cell>Dolný 
Kubín dolný udelenie poverenia udelenie</cell><cell>súd prvého stupňa okresný súd dolný bytom Dolný Kubín dolný kubín spisová</cell><cell>súdna exekútorka Dolný uplatniť poučiť spisová značka</cell><cell>dolný kubín udelenie poverenia súd súdny trovy exekúcie</cell></row><row><cell>5</cell><cell>trest vina kosť obviniť zlomenina</cell><cell>podľa ods ods ods ods písm zdraví podľa ods mesiacov podľa ods eur podľa ods</cell><cell>chodec priechod lopata kosť pruh</cell><cell>cestná premávka zlomenina kosť trest schváliť dohodu</cell></row></table></figure>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="1" xml:id="foot_0">Vocabulary 𝑉 does not contain stop words.</note>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgement</head><p>This work was supported by the Slovak Research and Development Agency under contract No. APVV-21-0336 Analysis of court decisions by methods of artificial intelligence. This work was supported by the Scientific Grant Agency of the Ministry of Education, Science, Research and Sport of the Slovak Republic under contract VEGA 1/0177/21 Descriptive and computational complexity of automata and algorithms. This work was supported by the internal project at the Faculty of Science at Pavol Jozef Šafárik University in Košice vvgs-pf-2021-1789 Legal text analysis using computer linguistics.</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">A review of keyphrase extraction</title>
		<author>
			<persName><forename type="first">E</forename><surname>Papagiannopoulou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Tsoumakas</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery</title>
		<imprint>
			<biblScope unit="volume">10</biblScope>
			<biblScope unit="page">e1339</biblScope>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">Using machine learning to predict decisions of the european court of human rights</title>
		<author>
			<persName><forename type="first">M</forename><surname>Medvedeva</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Vols</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Wieling</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Artificial Intelligence and Law</title>
		<imprint>
			<biblScope unit="volume">28</biblScope>
			<biblScope unit="page" from="237" to="266" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Predicting judicial decisions of the european court of human rights: A natural language processing perspective</title>
		<author>
			<persName><forename type="first">N</forename><surname>Aletras</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Tsarapatsanis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Preoţiuc-Pietro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Lampos</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">PeerJ Computer Science</title>
		<imprint>
			<biblScope unit="volume">2</biblScope>
			<biblScope unit="page">e93</biblScope>
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Efficient prediction of court judgments using an lstm+ cnn neural network model with an optimal feature set</title>
		<author>
			<persName><forename type="first">D</forename><surname>Alghazzawi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Bamasag</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Albeshri</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Sana</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Ullah</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">Z</forename><surname>Asghar</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Mathematics</title>
		<imprint>
			<biblScope unit="volume">10</biblScope>
			<biblScope unit="page">683</biblScope>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<monogr>
		<author>
			<persName><forename type="first">D</forename><surname>Varga</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Szoplák</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Krajci</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Sokol</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Gurskỳ</surname></persName>
		</author>
		<title level="m">Analysis and prediction of legal judgements in the slovak criminal proceedings</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">VICTOR: a dataset for Brazilian legal documents classification</title>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">H</forename><surname>Luz De Araujo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">E</forename><surname>De Campos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Braz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Correia Da Silva</surname></persName>
		</author>
		<ptr target="https://www.aclweb.org/anthology/2020.lrec-1.181" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 12th Language Resources and Evaluation Conference, European Language Resources Association</title>
				<meeting>the 12th Language Resources and Evaluation Conference, European Language Resources Association<address><addrLine>Marseille, France</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2020">2020</date>
			<biblScope unit="page" from="1449" to="1458" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<monogr>
		<author>
			<persName><forename type="first">I</forename><surname>Chalkidis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Jana</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Hartung</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Bommarito</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Androutsopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">M</forename><surname>Katz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Aletras</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2110.00976</idno>
		<title level="m">LexGLUE: A benchmark dataset for legal language understanding in english</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b7">
	<monogr>
		<ptr target="http://falm.info/" />
		<title level="m">The Free Access to Law Movement (FALM)</title>
				<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Text summarization from legal documents: a survey</title>
		<author>
			<persName><forename type="first">A</forename><surname>Kanapala</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Pal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Pamula</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Artificial Intelligence Review</title>
		<imprint>
			<biblScope unit="volume">51</biblScope>
			<biblScope unit="page" from="371" to="402" />
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">Distributional structure</title>
		<author>
			<persName><forename type="first">Z</forename><forename type="middle">S</forename><surname>Harris</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Word</title>
		<imprint>
			<biblScope unit="volume">10</biblScope>
			<biblScope unit="page" from="146" to="162" />
			<date type="published" when="1954">1954</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">A statistical interpretation of term specificity and its application in retrieval</title>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">S</forename><surname>Jones</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of documentation</title>
		<imprint>
			<date type="published" when="1972">1972</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Kp-miner: A keyphrase extraction system for english and arabic documents</title>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">R</forename><surname>El-Beltagy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Rafea</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Information systems</title>
		<imprint>
			<biblScope unit="volume">34</biblScope>
			<biblScope unit="page" from="132" to="144" />
			<date type="published" when="2009">2009</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">Clustering to find exemplar terms for keyphrase extraction</title>
		<author>
			<persName><forename type="first">Z</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Sun</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2009 conference on empirical methods in natural language processing</title>
				<meeting>the 2009 conference on empirical methods in natural language processing</meeting>
		<imprint>
			<date type="published" when="2009">2009</date>
			<biblScope unit="page" from="257" to="266" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">Yake! keyword extraction from single documents using multiple local features</title>
		<author>
			<persName><forename type="first">R</forename><surname>Campos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Mangaravite</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Pasquali</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Jorge</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Nunes</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Jatowt</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Information Sciences</title>
		<imprint>
			<biblScope unit="volume">509</biblScope>
			<biblScope unit="page" from="257" to="289" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">Textrank: Bringing order into text</title>
		<author>
			<persName><forename type="first">R</forename><surname>Mihalcea</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Tarau</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2004 conference on empirical methods in natural language processing</title>
				<meeting>the 2004 conference on empirical methods in natural language processing</meeting>
		<imprint>
			<date type="published" when="2004">2004</date>
			<biblScope unit="page" from="404" to="411" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">The anatomy of a large-scale hypertextual web search engine</title>
		<author>
			<persName><forename type="first">S</forename><surname>Brin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Page</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Computer networks and ISDN systems</title>
		<imprint>
			<biblScope unit="volume">30</biblScope>
			<biblScope unit="page" from="107" to="117" />
			<date type="published" when="1998">1998</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">Automatic keyword extraction from individual documents</title>
		<author>
			<persName><forename type="first">S</forename><surname>Rose</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Engel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Cramer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Cowley</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Text mining: applications and theory</title>
		<imprint>
			<biblScope unit="volume">1</biblScope>
			<biblScope unit="page" from="10" to="1002" />
			<date type="published" when="2010">2010</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<monogr>
		<ptr target="https://www.slov-lex.sk/zoznam-tezaurov" />
		<title level="m">Legislative and information portal, Ministry of Justice of the Slovak Republic</title>
				<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
	<note>Slovak law thesaurus</note>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">Graph embedding techniques, applications, and performance: A survey</title>
		<author>
			<persName><forename type="first">P</forename><surname>Goyal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Ferrara</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Knowl. Based Syst</title>
		<imprint>
			<biblScope unit="volume">151</biblScope>
			<biblScope unit="page" from="78" to="94" />
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<analytic>
		<title level="a" type="main">node2vec: Scalable feature learning for networks</title>
		<author>
			<persName><forename type="first">A</forename><surname>Grover</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Leskovec</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</title>
				<meeting>the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</meeting>
		<imprint>
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b20">
	<analytic>
		<title level="a" type="main">Semantic representation of slovak words</title>
		<author>
			<persName><forename type="first">S</forename><surname>Horvát</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Krajči</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Antoni</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="s">CEUR Workshop Proceedings</title>
		<imprint>
			<biblScope unit="volume">2718</biblScope>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b21">
	<analytic>
		<title level="a" type="main">Corpus-independent generic keyphrase extraction using word embedding vectors</title>
		<author>
			<persName><forename type="first">R</forename><surname>Wang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Software engineering research conference</title>
		<imprint>
			<biblScope unit="volume">39</biblScope>
			<date type="published" when="2014">2014</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<analytic>
		<title/>
		<author>
			<persName><forename type="first">D</forename><surname>Bank</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Koenigstein</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Giryes</surname></persName>
		</author>
		<idno>CoRR abs/2003.05991</idno>
		<ptr target="https://arxiv.org/abs/2003.05991" />
		<idno type="arXiv">arXiv:2003.05991</idno>
	</analytic>
	<monogr>
		<title level="j">Autoencoders</title>
		<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
