<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">HCMUS at MediaEval 2020: Image-Text Fusion for Automatic News-Images Re-Matching</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Thuc</forename><surname>Nguyen-Quang</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution" key="instit1">University of Science</orgName>
								<orgName type="institution" key="instit2">VNU-HCM</orgName>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Tuan-Duy</forename><forename type="middle">H</forename><surname>Nguyen</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution" key="instit1">University of Science</orgName>
								<orgName type="institution" key="instit2">VNU-HCM</orgName>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Thang-Long</forename><surname>Nguyen-Ho</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution" key="instit1">University of Science</orgName>
								<orgName type="institution" key="instit2">VNU-HCM</orgName>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Anh-Kiet</forename><surname>Duong</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution" key="instit1">University of Science</orgName>
								<orgName type="institution" key="instit2">VNU-HCM</orgName>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Nhat</forename><surname>Hoang-Xuan</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution" key="instit1">University of Science</orgName>
								<orgName type="institution" key="instit2">VNU-HCM</orgName>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Vinh-Thuyen</forename><surname>Nguyen-Truong</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution" key="instit1">University of Science</orgName>
								<orgName type="institution" key="instit2">VNU-HCM</orgName>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Hai-Dang</forename><surname>Nguyen</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution" key="instit1">University of Science</orgName>
								<orgName type="institution" key="instit2">VNU-HCM</orgName>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Minh-Triet</forename><surname>Tran</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution" key="instit1">University of Science</orgName>
								<orgName type="institution" key="instit2">VNU-HCM</orgName>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="department">John von Neumann Institute</orgName>
								<orgName type="institution">VNU-HCM</orgName>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">HCMUS at MediaEval 2020: Image-Text Fusion for Automatic News-Images Re-Matching</title>
					</analytic>
					<monogr>
						<imprint>
							<date/>
						</imprint>
					</monogr>
					<idno type="MD5">5EE52B0781D19BF481CCE0E08F0F4EF6</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2023-03-24T07:11+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>Matching text and images based on their semantics has an important role in cross-media retrieval. Especially, in terms of news, text and images connection is highly ambiguous. In the context of MediaEval 2020 Challenge, we propose three multi-modal methods for mapping text and images of news articles to the shared space in order to perform efficient cross-retrieval. Our methods show systemic improvement and validate our hypotheses, while the best-performed method reaches a recall@100 score of 0.2064.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">INTRODUCTION</head><p>News articles represent a complex class of multimedia, whose textual content and accompanying images might not be explicitly related <ref type="bibr" target="#b22">[25]</ref>. Existing research in multimedia and recommendation system domains mostly investigate image-text pairs with simple relationships, e.g., image captions that literally describe components of the images <ref type="bibr" target="#b13">[16]</ref>. To address this, the MediaEval 2020 NewsImages Task calls for researchers to investigate the real-world relationship of news text and images in more depth, in order to understand its implications for journalism and news recommendation systems <ref type="bibr" target="#b16">[19]</ref>.</p><p>Our team at HCMUS responds to this call by addressing the Image-Text Re-Matching task. Particularly, given a set of image-text pairs in the wild, the task requires us to correctly re-assign images to their decoupled articles, with the aim to understand the implication of journalism in choosing illustrative images.</p><p>Our methods mainly concern fusing cross-modal embeddings for automatic matching. We experimented with a range of embedded information, including simple set intersection, deep neural features, and knowledge-graph-enhanced neural features. We combine such features in various ways for various experiments. Finally, we obtain our best result with the ensemble of experimented methods.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2">METHODS 2.1 Metric Learning</head><p>The primary idea of this baseline method is using metric learning to project embeddings of image-text pairs to bases of significant similarity. Particularly, we use two approaches to embed image features: global context embedding and local context embedding. In the first approach, we use the EfficientNet <ref type="bibr" target="#b27">[30]</ref>, a SOTA classification architecture, to extract features of the image before taking the flatten output features. Our motivation in the latter approach is to harness critical local information from the extracted global context. Thus, we use the bottom-up-attention model <ref type="bibr" target="#b0">[3]</ref> to extract the top-𝑘 objects based on their confidence score, before passing them over to a self-attention sequential model. For both routines, we employ BERT <ref type="bibr" target="#b9">[12]</ref> language model to embed textual content, then project the textual and image embeddings to the same dimension. Finally, we train our Triplet Loss <ref type="bibr" target="#b12">[15]</ref> model with positive and negative pairs from a hard sample miner.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2">Image-Text Matching via Categorization</head><p>In this method, we train two gradient boosting decision trees <ref type="bibr" target="#b15">[18]</ref>, one for categorizing images, and the other for categorizing articles. The target categories are ['nrw', 'kultur', 'region', 'panorama', 'sport', 'wirtschaft', 'koeln', 'ratgeber', 'politik', 'unknown'], which are deduced from URLs in the train set.</p><p>We use features extracted for images and text to train the decision tree. To augment the data, we use VGG16, InceptionResNetV2, Mo-bileNetV2, EfficientNetB1-7, Xception, ResNet152V2, NASNetLarge, DenseNet201 <ref type="bibr" target="#b7">[10,</ref><ref type="bibr" target="#b11">14,</ref><ref type="bibr" target="#b14">17,</ref><ref type="bibr" target="#b24">[27]</ref><ref type="bibr" target="#b25">[28]</ref><ref type="bibr" target="#b26">[29]</ref><ref type="bibr" target="#b27">[30]</ref><ref type="bibr" target="#b29">32]</ref> for images, while using pretrained BERT models[2, 8, 9, 11], and pretrained ELECTRA models <ref type="bibr">[1,</ref><ref type="bibr" target="#b6">9]</ref> to extract contextual features.</p><p>We presume that images and articles of the same category might have some relations. Moreover, the rank of matching categories also affects ranking. For example, an image-text pair sharing a 3rd-ranked category might be less relevant than the pair sharing a 1st-ranked category. Hence, instead of using Jaccard similarity, we propose an iterative ranking method that takes into account the order of matched categories. At the 𝑘-th iteration, our method first finds top-𝑘 categories for each image and top-𝑘 categories for each article. Then for each article, we create a list of candidate images whose top-𝑘 categories intersect that of the article. This list of candidates at the 𝑘-th iteration is concatenated to the final list. Finally, the remaining images that are not candidates are kept in their order and concatenated to the end of the final list.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3">Graph-based Face-Name Matching</head><p>Based on our observation, in a lot of instances, the publisher uses a portrait of somebody mentioned in the text. We build the face-name graph to represent the relation between the name and the face.</p><p>Person name extraction: To automatically extract people's name from the text, we use entity-fishing <ref type="bibr" target="#b20">[23]</ref> -an open-source highperformance entity recognition and disambiguation tool. It relies on Random Forest and Gradient Tree Boosting to recognize named entities, in our case people's names, and link them against Wikidata entities using their word embeddings and Wikidata entities' embeddings.</p><p>Face encoding: We use face recognition open-source library <ref type="bibr" target="#b10">[13]</ref> to detect and represent the face as 128-dims vector. The tool uses a pre-trained model from the dlib-models repository <ref type="bibr" target="#b17">[20]</ref> and chooses ResNet as the backbone for face feature extraction.</p><p>Using the train set, we connect each person mentioned in the articles with features extracted from accompanying faces. During testing, we encode the face from the image and aggregate the number of matched faces connected to the people mentioned in the text. Two faces are matched if 𝐿2-distance between two vectors less than 0.6. The ranking of images is sorted by the total matched. T. Nguyen-Quang et al. Based on the hypothesis that the description of the image is semantically similar to the title, we build an image captioning model which is inspired by the tutorial Image captioning with visual attention <ref type="bibr" target="#b28">[31]</ref>.</p><p>The model has three main parts:</p><p>• Image feature extractor: We use EfficientNet <ref type="bibr" target="#b27">[30]</ref> for feature extraction. The feature has the shape (8, 8, 2048) • Feature encoder: The features pass through fully connected giving a vector 256-dims. • Decoder: To generate the caption, we use Bahdanau attention <ref type="bibr" target="#b1">[4]</ref> and GRU to predict the next word. We merge the train set with Flickr and COCO for training. We use fuzzywuzzy ratio and partial ratio string matching to compare captions and articles title. To represent the caption and the title as a vector, we use RoBERTa and doc2vec <ref type="bibr" target="#b19">[22]</ref> enwiki_dbow, apnews_dbow. Then, we calculate the similarity of two vectors by cosine similarity. The final score is calculated by: 𝑆 total =𝑆 wiki +𝑆 apnews +𝑆 RoBERTa + (1−𝐷 fuzzy ) + (1−𝐷 partial ) where 𝑆 wiki , 𝑆 apnews , 𝑆 RoBERTa are cosine similarity of two vectors generated by enwiki_dbow, apnews_dbow, RoBERTa, and 𝐷 𝑓 𝑢𝑧𝑧𝑦 , 𝐷 𝑝𝑎𝑟𝑡𝑖𝑎𝑙 are fuzzywuzzy and partial ratios, respectively.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.5">Image-Text Fusion with Knowledge Graph-based Contextual Embeddings</head><p>We observe that image-text pairs may not have any explicit relationships. Yet, such text-image pairs could still remotely related through layers of abstraction. For example, an article about violence could feature a stock photo of a gun barrel. Although such a stock photo does not literally illustrate the textual content, we understand that a gun conveys a sense of threat, which, in turn, is related to violence. Thus, we consider exploiting knowledge graphs. On a knowledge graph, such as BabelNet <ref type="bibr" target="#b21">[24]</ref>, the concept node of gun is also remotely connected with violence through intermediate nodes. Thus, we hypothesize that the projection of the textual and imagery content of a news article onto a knowledge graph would be connected, and their embeddings, in turns, could be in close proximity.</p><p>To implement this projection, we use EWISER word sense disambiguator <ref type="bibr" target="#b3">[6]</ref> to link textual entities from texts to their synsets in the WordNet subset of BabelNet. Then, the mean of accompanied SenSemBERT+LMMS embeddings corresponds to these extracted synsets representing the texts. For the images, we first map images to the textual domain. To enhance the method by featuring abstract human-level concepts in the mapping, we decide to use TResNET-L with Asymmetric Loss (ASL) <ref type="bibr" target="#b2">[5,</ref><ref type="bibr" target="#b23">26]</ref> pre-trained on OpenImagesV6 <ref type="bibr" target="#b18">[21]</ref> to extract multi-label from images. Our decision is grounded since OpenImagesV6 features image-level labels conform with Freebase <ref type="bibr" target="#b4">[7]</ref> knowledge graph with figurative labels, e.g., festivals, sport, comedy, etc., while TResNET-L with ASL is the stateof-the-art method for OpenImagesV6 multi-label benchmark. The extracted lists of labels are also linked with synsets using EWISER, and the mean of these synset embedding vectors represent images.</p><p>We then train a canonical correlation analysis (CCA) module with the vector representation on the train set before using it to transform test set vectors. For relatedness measurement, for each test article, we rank all images in the test set using the 𝐿2-distance between the article vector and image vectors.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3">EXPERIMENTAL RESULTS</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1">Data preprocessing</head><p>The MediaEval 2020 Image-Text Re-Matching benchmark releases three batches of data in total consists of the lede and titles of German news articles and their accompanying images. The first two are used for training, and the last one is used for testing.</p><p>For the sake of manual assertion, we decide to translate all the text to English using Google Translate and employ this translated text in our experiments. All data batches are cleaned automatically, with images crawled using the given URLs and pairs with 404 Not Found URLs dropped from the train set.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2">Submissions</head><p>First, TripletLocal and TripletGlobal demonstrate respective methods in Section 2.1. In both submissions, we empirically choose 𝑘 = 30 to embed images with top-𝑘 objects, then sort candidate images for each article by the similarity of their embedding to that of the article.</p><p>The Group-Face&amp;Cap submission, meanwhile, combine three different methods. First, we matches image-article pairs using the method in Section 2.2 with 𝑘 = 5. However, at each iteration, we sort the candidates by 𝑆 𝑡𝑜𝑡𝑎𝑙 score mentioned in 2.4. Finally, candidate images matched with the article through the method in Section 2.3 are prioritized to the top of the final result.</p><p>The KG-Fusion submission manifest the method described in Section 2.5. Specifically, the TResNet-L with ASL model used for multilabel extraction accepts a sigmoid threshold of 0.7, the EWISER disambiguator consumes chunks of 5 tokens, and the target decomposition of the CCA module has 64 components.</p><p>Finally, the Ensemble submission combines all described methods, weighting each models based on their efficiency. As such, the final ranking of a candidate image is:</p><formula xml:id="formula_0">𝑅 Ensemble =𝑤 1 𝑅 Caption +𝑤 2 𝑅 Triplet +𝑤 3 𝑅 Face +𝑤 4 𝑅 KG−Fusion .</formula><p>where 𝑅 𝐸𝑛𝑠𝑒𝑚𝑏𝑙𝑒 , 𝑅 𝐶𝑎𝑝𝑡𝑖𝑜𝑛 , 𝑅 𝑇 𝑟𝑖𝑝𝑙𝑒𝑡 , 𝑅 𝐹𝑎𝑐𝑒 , 𝑅 𝐾𝐺−𝐹𝑢𝑠𝑖𝑜𝑛 are ranks of the image produced by Group-Face&amp;Cap, TripletGlobal, Face Matching, and KG-Fusion methods, respectively. Weighting factors are empirically chosen to be 𝑤 1 =𝑤 4 = 1, 𝑤 2 = 0.02 and 𝑤 3 = 0.25.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4">CONCLUSION AND FUTURE WORKS</head><p>Although, our methods show poor accuracy, they systematically increase the performance on the recall@100 metric. This fact validates our hypotheses that incorporating high-level semantics increase performance. Moreover, our methods yield consistent results, i.e., high-ranking images are of relevance to queried articles. Thus, they can still be useful for building news image recommendation systems as the news-images suitability is not injective in practice. The ensemble method's performance also suggests practical system builders to use multiple methods to handle different aspects of the complex image-text multimodal relation. In future works, we wish to investigate better fusion methods, consider a thorough ablation study for proposed methods, and enhance the dataset for thorough evaluation with information retrieval metrics like NDCG.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1 :</head><label>1</label><figDesc>Submission result</figDesc><table><row><cell>Method</cell><cell cols="3">Acc. Recall@100 MRR@100</cell></row><row><cell>TripletLocal</cell><cell>0.0000</cell><cell>0.0248</cell><cell>0.0012</cell></row><row><cell>TripletGlobal</cell><cell>0.0002</cell><cell>0.0238</cell><cell>0.0013</cell></row><row><cell cols="2">Group-Face&amp;Cap 0.0194</cell><cell>0.1322</cell><cell>0.0237</cell></row><row><cell>KG-Fusion</cell><cell>0.0051</cell><cell>0.1667</cell><cell>0.0164</cell></row><row><cell>Ensemble</cell><cell>0.0075</cell><cell>0.2064</cell><cell>0.0222</cell></row><row><cell cols="3">2.4 Image-Text Fusion with Image</cell><cell></cell></row><row><cell cols="4">Captioning and Contextual Embeddings</cell></row></table></figure>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><p>Acknowledgments: Research is supported by Vingroup Innovation Foundation (VINIF) in project code VINIF.2019.DA19.</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<monogr>
		<title level="m" type="main">Bottom-Up and Top-Down Attention for Image Captioning and VQA</title>
		<author>
			<persName><forename type="first">Peter</forename><surname>Anderson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Xiaodong</forename><surname>He</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Chris</forename><surname>Buehler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Damien</forename><surname>Teney</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Mark</forename><surname>Johnson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Stephen</forename><surname>Gould</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Lei</forename><surname>Zhang</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1707.07998</idno>
		<ptr target="http://arxiv.org/abs/1707.07998" />
		<imprint>
			<date type="published" when="2017">2017. 2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<monogr>
		<title level="m" type="main">Neural Machine Translation by Jointly Learning to Align and Translate</title>
		<author>
			<persName><forename type="first">Dzmitry</forename><surname>Bahdanau</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Kyunghyun</forename><surname>Cho</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Yoshua</forename><surname>Bengio</surname></persName>
		</author>
		<idno>arXiv:cs.CL/1409.0473</idno>
		<imprint>
			<date type="published" when="2016">2016. 2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<monogr>
		<title level="m" type="main">Asymmetric Loss For Multi-Label Classification</title>
		<author>
			<persName><forename type="first">Emanuel</forename><surname>Ben-Baruch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Tal</forename><surname>Ridnik</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Nadav</forename><surname>Zamir</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Asaf</forename><surname>Noy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Itamar</forename><surname>Friedman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Matan</forename><surname>Protter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Lihi</forename><surname>Zelnik-Manor</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2009.14119</idno>
		<imprint>
			<date type="published" when="2020">2020. 2020</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Breaking through the 80% glass ceiling: Raising the state of the art in Word Sense Disambiguation by incorporating knowledge graph information</title>
		<author>
			<persName><forename type="first">Michele</forename><surname>Bevilacqua</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Roberto</forename><surname>Navigli</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</title>
				<meeting>the 58th Annual Meeting of the Association for Computational Linguistics</meeting>
		<imprint>
			<date type="published" when="2020">2020</date>
			<biblScope unit="page" from="2854" to="2864" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Freebase: a collaboratively created graph database for structuring human knowledge</title>
		<author>
			<persName><forename type="first">Kurt</forename><surname>Bollacker</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Colin</forename><surname>Evans</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Praveen</forename><surname>Paritosh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Tim</forename><surname>Sturge</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Jamie</forename><surname>Taylor</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2008 ACM SIGMOD international conference on Management of data</title>
				<meeting>the 2008 ACM SIGMOD international conference on Management of data</meeting>
		<imprint>
			<date type="published" when="2008">2008</date>
			<biblScope unit="page" from="1247" to="1250" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<monogr>
		<author>
			<persName><forename type="first">Malte</forename><surname>Pietsch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Tanay</forename><surname>Soni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Branden</forename><surname>Chan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Timo</forename><surname>Möller</surname></persName>
		</author>
		<ptr target="https://huggingface.co/bert-base-german-cased" />
		<title level="m">Model from</title>
				<imprint>
			<date type="published" when="2020">2020. 2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<monogr>
		<title level="m" type="main">German&apos;s Next Language Model</title>
		<author>
			<persName><forename type="first">Branden</forename><surname>Chan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Stefan</forename><surname>Schweter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Timo</forename><surname>Möller</surname></persName>
		</author>
		<idno>arXiv:cs.CL/2010.10906</idno>
		<imprint>
			<date type="published" when="2020">2020. 2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">Xception: Deep learning with depthwise separable convolutions</title>
		<author>
			<persName><forename type="first">François</forename><surname>Chollet</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE conference on computer vision and pattern recognition</title>
				<meeting>the IEEE conference on computer vision and pattern recognition</meeting>
		<imprint>
			<date type="published" when="2017">2017</date>
			<biblScope unit="page" from="1251" to="1258" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<monogr>
		<ptr target="https://huggingface.co/dbmdz/bert-base-german-uncased" />
		<title level="m">Model from</title>
				<imprint>
			<date type="published" when="2020">2020. 2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<monogr>
		<title level="m" type="main">BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding</title>
		<author>
			<persName><forename type="first">Jacob</forename><surname>Devlin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Ming-Wei</forename><surname>Chang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Kenton</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Kristina</forename><surname>Toutanova</surname></persName>
		</author>
		<idno>arXiv:cs.CL/1810.04805</idno>
		<imprint>
			<date type="published" when="2019">2019. 2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<monogr>
		<author>
			<persName><forename type="first">Adam</forename><surname>Geitgey</surname></persName>
		</author>
		<ptr target="https://github.com/ageitgey/face_recognition" />
		<title level="m">Face Recognition</title>
				<imprint>
			<date type="published" when="2018">2018. 2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Identity mappings in deep residual networks</title>
		<author>
			<persName><forename type="first">Kaiming</forename><surname>He</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Xiangyu</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Shaoqing</forename><surname>Ren</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Jian</forename><surname>Sun</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">European conference on computer vision</title>
				<imprint>
			<publisher>Springer</publisher>
			<date type="published" when="2016">2016</date>
			<biblScope unit="page" from="630" to="645" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<monogr>
		<title level="m" type="main">Deep metric learning using Triplet network</title>
		<author>
			<persName><forename type="first">Elad</forename><surname>Hoffer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Nir</forename><surname>Ailon</surname></persName>
		</author>
		<idno>arXiv:cs.LG/1412.6622</idno>
		<imprint>
			<date type="published" when="2018">2018. 2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">A comprehensive survey of deep learning for image captioning</title>
		<author>
			<persName><forename type="first">Md</forename><forename type="middle">Zakir</forename><surname>Hossain</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Ferdous</forename><surname>Sohel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Mohd</forename><forename type="middle">Fairuz</forename><surname>Shiratuddin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Hamid</forename><surname>Laga</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">ACM Computing Surveys (CSUR)</title>
		<imprint>
			<biblScope unit="volume">51</biblScope>
			<biblScope unit="issue">6</biblScope>
			<biblScope unit="page" from="1" to="36" />
			<date type="published" when="2019">2019. 2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">Densely connected convolutional networks</title>
		<author>
			<persName><forename type="first">Gao</forename><surname>Huang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Zhuang</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Laurens</forename><surname>Van Der Maaten</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Kilian</forename><forename type="middle">Q</forename><surname>Weinberger</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE conference on computer vision and pattern recognition</title>
				<meeting>the IEEE conference on computer vision and pattern recognition</meeting>
		<imprint>
			<date type="published" when="2017">2017</date>
			<biblScope unit="page" from="4700" to="4708" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">LightGBM: A Highly Efficient Gradient Boosting Decision Tree</title>
		<author>
			<persName><forename type="first">Guolin</forename><surname>Ke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Qi</forename><surname>Meng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Thomas</forename><surname>Finley</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Taifeng</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Wei</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Weidong</forename><surname>Ma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Qiwei</forename><surname>Ye</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Tie-Yan</forename><surname>Liu</surname></persName>
		</author>
		<ptr target="https://proceedings.neurips.cc/paper/2017/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf" />
	</analytic>
	<monogr>
		<title level="m">Advances in Neural Information Processing Systems</title>
				<editor>
			<persName><forename type="first">I</forename><surname>Guyon</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">U</forename><forename type="middle">V</forename><surname>Luxburg</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">S</forename><surname>Bengio</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">H</forename><surname>Wallach</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">R</forename><surname>Fergus</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">S</forename><surname>Vishwanathan</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">R</forename><surname>Garnett</surname></persName>
		</editor>
		<imprint>
			<publisher>Curran Associates, Inc</publisher>
			<date type="published" when="2017">2017</date>
			<biblScope unit="volume">30</biblScope>
			<biblScope unit="page" from="3146" to="3154" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">News Images in MediaEval</title>
		<author>
			<persName><forename type="first">Benjamin</forename><surname>Kille</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Andreas</forename><surname>Lommatzsch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Özlem</forename><surname>Özgöbek</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proc. of the MediaEval 2020 Workshop. Online</title>
				<meeting>of the MediaEval 2020 Workshop. Online</meeting>
		<imprint>
			<date type="published" when="2020">2020. 2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<monogr>
		<author>
			<persName><forename type="first">Davis</forename><forename type="middle">E</forename><surname>King</surname></persName>
		</author>
		<ptr target="https://github.com/davisking/dlib-models" />
		<title level="m">dlib-models</title>
				<imprint>
			<date type="published" when="2018">2018. 2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">The Open Images Dataset V4: Unified image classification, object detection, and visual relationship detection at scale</title>
		<author>
			<persName><forename type="first">Alina</forename><surname>Kuznetsova</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Hassan</forename><surname>Rom</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Neil</forename><surname>Alldrin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Jasper</forename><surname>Uijlings</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Ivan</forename><surname>Krasin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Jordi</forename><surname>Pont-Tuset</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Shahab</forename><surname>Kamali</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Stefan</forename><surname>Popov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Matteo</forename><surname>Malloci</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Alexander</forename><surname>Kolesnikov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Tom</forename><surname>Duerig</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Vittorio</forename><surname>Ferrari</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IJCV</title>
		<imprint>
			<date type="published" when="2020">2020. 2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<monogr>
		<title level="m" type="main">An Empirical Evaluation of doc2vec with Practical Insights into Document Embedding Generation</title>
		<author>
			<persName><forename type="first">Jey</forename><forename type="middle">Han</forename><surname>Lau</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Timothy</forename><surname>Baldwin</surname></persName>
		</author>
		<idno>arXiv:cs.CL/1607.05368</idno>
		<imprint>
			<date type="published" when="2016">2016. 2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b20">
	<monogr>
		<author>
			<persName><forename type="first">Patrice</forename><surname>Lopez</surname></persName>
		</author>
		<ptr target="https://github.com/kermitt2/entity-fishing" />
		<title level="m">Entity Fishing</title>
				<imprint>
			<date type="published" when="2020">2020. 2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b21">
	<analytic>
		<title level="a" type="main">BabelNet: Building a very large multilingual semantic network</title>
		<author>
			<persName><forename type="first">Roberto</forename><surname>Navigli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Simone</forename><forename type="middle">Paolo</forename><surname>Ponzetto</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 48th annual meeting of the association for computational linguistics</title>
				<meeting>the 48th annual meeting of the association for computational linguistics</meeting>
		<imprint>
			<date type="published" when="2010">2010</date>
			<biblScope unit="page" from="216" to="225" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<monogr>
		<title level="m" type="main">The Connection between the Text and Images of News Articles: New Insights for Multimedia Analysis</title>
		<author>
			<persName><forename type="first">Nelleke</forename><surname>Oostdijk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Hans</forename><surname>Van Halteren</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Erkan</forename><surname>Başar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Martha</forename><forename type="middle">A</forename><surname>Larson</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2020">2020. 2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b23">
	<monogr>
		<author>
			<persName><forename type="first">Tal</forename><surname>Ridnik</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Hussam</forename><surname>Lawen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Asaf</forename><surname>Noy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Itamar</forename><surname>Friedman</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2003.13630</idno>
		<title level="m">TResNet: High Performance GPU-Dedicated Architecture</title>
				<imprint>
			<date type="published" when="2020">2020. 2020</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b24">
	<analytic>
		<title level="a" type="main">Mobilenetv2: Inverted residuals and linear bottlenecks</title>
		<author>
			<persName><forename type="first">Mark</forename><surname>Sandler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Andrew</forename><surname>Howard</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Menglong</forename><surname>Zhu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Andrey</forename><surname>Zhmoginov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Liang-Chieh</forename><surname>Chen</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE conference on computer vision and pattern recognition</title>
				<meeting>the IEEE conference on computer vision and pattern recognition</meeting>
		<imprint>
			<date type="published" when="2018">2018</date>
			<biblScope unit="page" from="4510" to="4520" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b25">
	<monogr>
		<title level="m" type="main">Very deep convolutional networks for large-scale image recognition</title>
		<author>
			<persName><forename type="first">Karen</forename><surname>Simonyan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Andrew</forename><surname>Zisserman</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1409.1556</idno>
		<imprint>
			<date type="published" when="2014">2014. 2014</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b26">
	<monogr>
		<author>
			<persName><forename type="first">Christian</forename><surname>Szegedy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Sergey</forename><surname>Ioffe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Vincent</forename><surname>Vanhoucke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Alex</forename><surname>Alemi</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1602.07261</idno>
		<title level="m">Inception-v4, inception-resnet and the impact of residual connections on learning</title>
				<imprint>
			<date type="published" when="2016">2016. 2016</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b27">
	<monogr>
		<title level="m" type="main">EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks</title>
		<author>
			<persName><forename type="first">Mingxing</forename><surname>Tan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Quoc</forename><forename type="middle">V</forename><surname>Le</surname></persName>
		</author>
		<idno>arXiv:cs.LG/1905.11946</idno>
		<imprint>
			<date type="published" when="2020">2020. 2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b28">
	<monogr>
		<title level="m" type="main">Show, Attend and Tell: Neural Image Caption Generation with Visual Attention</title>
		<author>
			<persName><forename type="first">Kelvin</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Jimmy</forename><surname>Ba</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Ryan</forename><surname>Kiros</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Kyunghyun</forename><surname>Cho</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Aaron</forename><surname>Courville</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Ruslan</forename><surname>Salakhutdinov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Richard</forename><surname>Zemel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Yoshua</forename><surname>Bengio</surname></persName>
		</author>
		<idno>arXiv:cs.LG/1502.03044</idno>
		<imprint>
			<date type="published" when="2016">2016. 2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b29">
	<analytic>
		<title level="a" type="main">Learning transferable architectures for scalable image recognition</title>
		<author>
			<persName><forename type="first">Barret</forename><surname>Zoph</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Vijay</forename><surname>Vasudevan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Jonathon</forename><surname>Shlens</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Quoc V</forename><surname>Le</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE conference on computer vision and pattern recognition</title>
				<meeting>the IEEE conference on computer vision and pattern recognition</meeting>
		<imprint>
			<date type="published" when="2018">2018</date>
			<biblScope unit="page" from="8697" to="8710" />
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
