<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Multimodal image geocoding: the 2013 RECOD&apos;s approach</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Lin</forename><forename type="middle">Tzy</forename><surname>Li</surname></persName>
							<email>lintzyli@ic.unicamp.br</email>
							<affiliation key="aff0">
								<orgName type="department">Institute of Computing</orgName>
								<orgName type="laboratory">RECOD Lab</orgName>
								<orgName type="institution">University of Campinas (UNICAMP)</orgName>
								<address>
									<postCode>13083-852</postCode>
									<settlement>Campinas</settlement>
									<region>SP</region>
									<country>Brazil</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="department">Telecommunications Res. &amp; Dev. Center</orgName>
								<orgName type="institution">CPqD Foundation</orgName>
								<address>
									<postCode>13086-902</postCode>
									<settlement>Campinas</settlement>
									<region>SP</region>
									<country>Brazil</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Jurandy</forename><surname>Almeida</surname></persName>
							<email>jurandy@ic.unicamp.br</email>
							<affiliation key="aff0">
								<orgName type="department">Institute of Computing</orgName>
								<orgName type="laboratory">RECOD Lab</orgName>
								<orgName type="institution">University of Campinas (UNICAMP)</orgName>
								<address>
									<postCode>13083-852</postCode>
									<settlement>Campinas</settlement>
									<region>SP</region>
									<country>Brazil</country>
								</address>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="department">Institute of Science and Technology</orgName>
								<orgName type="institution">Federal University of Sao Paulo (UNIFESP)</orgName>
								<address>
									<addrLine>Sao Jose dos Campos</addrLine>
									<postCode>12231-280</postCode>
									<region>SP</region>
									<country>Brazil</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Otávio</forename><forename type="middle">A B</forename><surname>Penatti</surname></persName>
							<email>penatti@ic.unicamp.br</email>
							<affiliation key="aff0">
								<orgName type="department">Institute of Computing</orgName>
								<orgName type="laboratory">RECOD Lab</orgName>
								<orgName type="institution">University of Campinas (UNICAMP)</orgName>
								<address>
									<postCode>13083-852</postCode>
									<settlement>Campinas</settlement>
									<region>SP</region>
									<country>Brazil</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Rodrigo</forename><forename type="middle">T</forename><surname>Calumby</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Institute of Computing</orgName>
								<orgName type="laboratory">RECOD Lab</orgName>
								<orgName type="institution">University of Campinas (UNICAMP)</orgName>
								<address>
									<postCode>13083-852</postCode>
									<settlement>Campinas</settlement>
									<region>SP</region>
									<country>Brazil</country>
								</address>
							</affiliation>
							<affiliation key="aff3">
								<orgName type="department">Dept. of Exact Sciences</orgName>
								<orgName type="institution">University of Feira de Santana (UEFS)</orgName>
								<address>
									<addrLine>Feira de Santana</addrLine>
									<postCode>44036-900</postCode>
									<region>BA</region>
									<country>Brazil</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Daniel</forename><forename type="middle">C G</forename><surname>Pedronette</surname></persName>
							<email>daniel@rc.unesp.br</email>
							<affiliation key="aff0">
								<orgName type="department">Institute of Computing</orgName>
								<orgName type="laboratory">RECOD Lab</orgName>
								<orgName type="institution">University of Campinas (UNICAMP)</orgName>
								<address>
									<postCode>13083-852</postCode>
									<settlement>Campinas</settlement>
									<region>SP</region>
									<country>Brazil</country>
								</address>
							</affiliation>
							<affiliation key="aff4">
								<orgName type="department">Dept. of Stat., Applied Math. and Computing</orgName>
								<orgName type="institution">Universidade Estadual Paulista (UNESP)</orgName>
								<address>
									<addrLine>Rio Claro</addrLine>
									<postCode>13506-900</postCode>
									<region>SP</region>
									<country>Brazil</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Marcos</forename><forename type="middle">A</forename><surname>Gonçalves</surname></persName>
							<affiliation key="aff5">
								<orgName type="department">Dept. of Computer Science</orgName>
								<orgName type="institution">Federal University of Minas Gerais (UFMG)</orgName>
								<address>
									<postCode>31270-010</postCode>
									<settlement>Belo Horizonte</settlement>
									<region>MG</region>
									<country>Brazil</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Ricardo</forename><nameLink>da</nameLink><forename type="middle">S</forename><surname>Torres</surname></persName>
							<email>rtorres@ic.unicamp.br</email>
							<affiliation key="aff0">
								<orgName type="department">Institute of Computing</orgName>
								<orgName type="laboratory">RECOD Lab</orgName>
								<orgName type="institution">University of Campinas (UNICAMP)</orgName>
								<address>
									<postCode>13083-852</postCode>
									<settlement>Campinas</settlement>
									<region>SP</region>
									<country>Brazil</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">Multimodal image geocoding: the 2013 RECOD&apos;s approach</title>
					</analytic>
					<monogr>
						<imprint>
							<date/>
						</imprint>
					</monogr>
					<idno type="MD5">10CA1C4C14CE6D21B68A873C5FCA6567</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2023-03-19T17:59+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>This work describes the approach used by the RECOD team in the MediaEval Placing Task of 2013, in which we were required to develop an automatic scheme to assign geographical locations to images. Our approach is multimodal, considering textual and visual descriptors, which are combined by a rank aggregation strategy. We estimate the location of test images based on the coordinates of top-ranked images in the list of combined results.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">INTRODUCTION</head><p>Geocoding multimedia material has gained great attention in the latest years given the importance of providing richer services for users, like placing information on maps. Image geocoding is the objective of the Placing Task in 2013, i.e., it requires participants to assign geographical locations to images. Details about the Placing task, its dataset, and the evaluation protocol can be found in <ref type="bibr" target="#b1">[1]</ref>.</p><p>In this paper, we present our multimodal approach that combines different textual and visual descriptors uniformly. We combine them using a rank aggregation strategy, previously introduced in <ref type="bibr" target="#b4">[4]</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">PROPOSED APPROACH</head><p>We handled the task of automatically assigning a geographical location to images using nearest neighbor searches on aggregated ranked lists, which combine textual and visual features. The strengths of our approach are its simplicity and its power to combine multiple description modalities.</p><p>For evaluation purposes in the training phase, we have selected a validation set of 5,000 images from the development set of around 8.5 million images. First, each photo from the development set was assigned to a fixed cell of 1-by-1 degree based on its ground truth latitude and longitude. Then, the resulting grid was summarized by the total of photos (density) in each cell relative to the dataset size. Finally, the evaluation images (5,000 photos) were randomly picked from each cell, taking into account its density.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1">Features</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Textual</head><p>From textual metadata, we used only the photo tags to compute similarities between the images. The tags were stemmed and stopwords were removed. The text similarity functions used were BM25 and TF-IDF, as implemented by the Lucene API.</p><note place="foot">Copyright is held by the author/owner(s). MediaEval 2013 Workshop, October 18-19, 2013, Barcelona, Spain</note></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Visual</head><p>Given the large dataset, we had to select carefully the descriptors to be used. Initially, we have evaluated some of the descriptors provided with the dataset, like: color and edge directivity descriptor (CEDD), scalable color (SCD), gabor filter. Using the validation set, we have noticed that the best results were achieved by CEDD. Although SCD has shown the best results in <ref type="bibr" target="#b2">[2]</ref>, in our validation set, it did not perform well for our geocoding approach.</p><p>In addition to CEDD, we used BIC (border/interior pixel classification). This descriptor was chosen due to its good results in large scale experiments <ref type="bibr">[5]</ref>. For this, we downloaded the whole photo dataset, resizing the images to have at most 100 thousand pixels, as suggested by <ref type="bibr" target="#b6">[6]</ref> for large scale experiments, and extracted the 128-dimensional BIC feature vector of each image. The Manhattan distance (L1) was used for both BIC and CEDD.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2">Rank aggregation</head><p>As last year, we used a rank aggregation strategy to combine different descriptors <ref type="bibr" target="#b3">[3]</ref>. For this year, due to the size of the development set, we created a ranked list limited to the top 1,000 most similar photos for each test image.</p><p>We have used an aggregation function similar to sima (numerator is m instead of 2) proposed in <ref type="bibr" target="#b3">[3]</ref>. When the intersection of top-1000 lists computed by different features is small, the size of the final aggregated list tends to (m×1000), where m is the number of features combined. We select the top-1000 images that present the highest aggregated score as the output of the rank aggregation step.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3">Geocoding</head><p>For geocoding the test images, we have used a nearest neighbor approach. We used the development set (∼8.5 million images) as geo-profiles and each test image was compared to the whole development set. For comparing the images, we have used each type of feature independently (textual or visual). For a given test image, the ranked list of each feature is produced. All the lists are then combined by our rank aggregation strategy and the final ranked list is generated. The lat/long of the first image (most similar) in this final list is assigned to the test image.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">OUR SUBMISSIONS &amp; RESULTS</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Submitted runs</head><p>Our submissions for this year are: run1: combines 2 textual descriptors: BM25 + TF-IDF; run2: combines 2 visual descriptors: BIC + CEDD; run3: one visual descriptor: BIC; run4: combines 2 textual and 2 visual descriptors: BM25 + TF-IDF + BIC + CEDD; run5: combines 4 textual descriptors: BM25 + TF-IDF 1 .</p><p>Runs 1 and 5 used only textual features. Thus, for test images without tags, there was no way to apply our similarity ranked list approach. As post-processing, we randomly selected an item from the development set to transfer its latitude and longitude to the test image.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1">Results</head><p>Besides the organizers' standard evaluation metric, we also applied the WAS score we proposed in <ref type="bibr" target="#b4">[4]</ref>. This evaluation metric gives an overview of a method's performance expressed by a score between [0,1], 0 being very bad and 1 indicating a perfect estimate with a higher weight assigned to more precise results. The WAS takes into account every single result of the whole test set to indicate and summarize the level of precision of an evaluated method as a whole.</p><p>Let d(i) be the geographic distance between the predicted and the ground truth location of the image i. The proposed score for the result of a given test image i is defined as:</p><formula xml:id="formula_0">score(i) = 1 − log(1+d(i)) / log(1+Rmax)</formula><p>, where Rmax is the maximum distance between any two points on the Earth's surface (half of Earth's circumference at the Equator is 20,027.5 km).</p><p>Let D be a test dataset with n images whose locations need to be predicted. The overall score for the predictions of a method m is defined as:</p><formula xml:id="formula_1">WAS(m) = (1/n) Σ_{i=1}^{n} score(i)</formula><p>. As we can observe in Table <ref type="table" target="#tab_1">2</ref>, the test runs based solely on textual information yielded the best results (runs 1, 4, and 5), while those based only on visual descriptors presented low accuracy. The possible reason is the semantic gap, as there might be many different places with similar visual appearance, especially in a large dataset like the one used for training. Another potential issue was the large number of ties in the first positions of ranked lists of visual descriptors. Given our 1-nn geocoding approach, this probably degraded our results. However, we can see that by combining BIC+CEDD (run 2) we improve the results of BIC alone (run 3).<note place="foot" n="1">Non-English tags were translated to English using the Google Translate service and combined with the original tags.</note> The combination of textual and visual descriptors (run 4) was slightly worse than the textual descriptors isolated. One possible reason is the large difference between textual and visual results.</p><p>Observe that for the test set (Table <ref type="table" target="#tab_1">2</ref>), our results were quite different from our validation set (Table <ref type="table" target="#tab_0">1</ref>), mainly for the visual features. While in the test3 set, BIC achieved less than 1% in the 1km radius, in the validation set, it presented 15.32%. Because of this, in the validation set, the fusion (run 4) results improved over run 1. The huge difference between validation and test results might be due to a property of the test set not considered when building the validation set: the users who contributed the photos in the training set are different from those who contributed the photos in the test set.</p><p>Regarding the distribution of test results, for the visual descriptors (runs 2 and 3), the 1st Quartile shows that 25% of the items were geocoded at most 1,900km from the correct location. On the other hand, for the textual descriptors and their combinations (runs 1, 4, and 5), 25% of the items are very close to their correct locations (less than 3km).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">CONCLUSIONS</head><p>Our best results were observed for the methods based only on textual description. For them, we could geocode within 1km radius around 20% of the testing set (test3). Considering visual descriptors, the main challenge this year was the large scale dataset, which poses time and space constraints in the descriptors to be used. Our rank aggregation strategy, for the test set, was only effective for combining textual descriptors. Combining textual and visual descriptors did not improve the results. As future work, we would like to evaluate a more elaborate geocoding approach, similar to the scheme used to create our validation set, for example.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1 :</head><label>1</label><figDesc>Validation set results.</figDesc><table><row><cell cols="3">Precision Run 1 Run 2</cell><cell cols="3">Run 3 Run 4 Run 5</cell></row><row><cell cols="2">1km 64.56%</cell><cell>16.86%</cell><cell cols="3">15.32% 68.82% 64.62%</cell></row><row><cell cols="2">10km 73.64%</cell><cell>17.68%</cell><cell cols="3">16.10% 75.90% 73.60%</cell></row><row><cell cols="2">100km 77.58%</cell><cell>18.64%</cell><cell cols="3">17.04% 78.94% 77.58%</cell></row><row><cell cols="2">500km 80.20%</cell><cell>22.86%</cell><cell cols="3">13.40% 81.10% 80.22%</cell></row><row><cell cols="2">1000km 82.18%</cell><cell>28.32%</cell><cell cols="3">20.12% 82.74% 82.32%</cell></row><row><cell cols="3">WAS score 0.7866 0.3053</cell><cell cols="3">0.2889 0.8019 0.7866</cell></row><row><cell></cell><cell cols="3">Distance distribution</cell><cell></cell><cell></cell></row><row><cell>1st Quartile</cell><cell>0.00</cell><cell>698.40</cell><cell>885.30</cell><cell>0.00</cell><cell>0.00</cell></row><row><cell>Median</cell><cell cols="3">0.03 5,499.40 5,835.80</cell><cell>0.00</cell><cell>0.04</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>Table 2 :</head><label>2</label><figDesc>Test results using test3 set (53,000 items).</figDesc><table><row><cell cols="2">Precision Run 1</cell><cell>Run 2</cell><cell cols="2">Run 3 Run 4 Run 5</cell></row><row><cell cols="2">1km 20.14%</cell><cell>0.37%</cell><cell cols="2">0.28% 20.11% 18.82%</cell></row><row><cell cols="2">10km 37.60%</cell><cell>0.80%</cell><cell cols="2">0.67% 37.10% 35.93%</cell></row><row><cell cols="2">100km 47.66%</cell><cell>1.69%</cell><cell cols="2">1.51% 46.97% 45.97%</cell></row><row><cell cols="2">500km 56.62%</cell><cell>6.73%</cell><cell cols="2">6.25% 55.83% 55.74%</cell></row><row><cell cols="2">1000km 63.17%</cell><cell>14.32%</cell><cell cols="2">13.78% 62.26% 62.43%</cell></row><row><cell cols="2">WAS score 0.5240</cell><cell>0.1653</cell><cell cols="2">0.1623 0.5190 0.5128</cell></row><row><cell></cell><cell cols="3">Distance distribution</cell></row><row><cell>1st Quartile</cell><cell cols="3">1.73 1,869.00 1,962.00</cell><cell>1.76</cell><cell>2.05</cell></row><row><cell>Median</cell><cell cols="3">168.22 6,632.00 6,729.00</cell><cell>196.79</cell><cell>225.67</cell></row></table></figure>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>We thank the support of FAPESP (2011/11171-5, 2009/10554-8), CNPq (306580/2012-8, 484254/2012-0), CAPES, FAPEMIG, Samsung, ACM SIGIR, and MediaEval organizers.</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<monogr>
		<title/>
		<author>
			<persName><surname>References</surname></persName>
		</author>
		<imprint/>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">Working Notes for the Placing Task at MediaEval</title>
		<author>
			<persName><forename type="first">C</forename><surname>Hauff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Thomee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Trevisiol</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">MediaEval 2013 Workshop</title>
				<imprint>
			<date type="published" when="2013">October 18-19 2013</date>
			<biblScope unit="volume">1043</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Multimodal geo-tagging in social media websites using hierarchical spatial segmentation</title>
		<author>
			<persName><forename type="first">P</forename><surname>Kelm</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Schmiedeke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Sikora</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">International Workshop on Location-Based Social Networks</title>
				<imprint>
			<date type="published" when="2012">2012</date>
			<biblScope unit="page" from="32" to="39" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">A multimodal approach for video geocoding</title>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">T</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Almeida</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">C G</forename><surname>Pedronette</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><forename type="middle">A B</forename><surname>Penatti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">D S</forename><surname>Torres</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Working Notes Proc. MediaEval Workshop</title>
				<imprint>
			<date type="published" when="2012">2012</date>
			<biblScope unit="volume">927</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">A rank aggregation framework for video multimodal geocoding</title>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">T</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">C G</forename><surname>Pedronette</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Almeida</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><forename type="middle">A</forename><surname>Penatti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">T</forename><surname>Calumby</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">D S</forename><surname>Torres</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Mult. Tools and App</title>
		<imprint>
			<biblScope unit="page" from="1" to="37" />
			<date type="published" when="2013">2013</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">Comparative study of global color and texture descriptors for web image retrieval</title>
		<author>
			<persName><forename type="first">O</forename><forename type="middle">A B</forename><surname>Penatti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Valle</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">D S</forename><surname>Torres</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">J. Vis. Comm. and Image Repr</title>
		<imprint>
			<biblScope unit="volume">23</biblScope>
			<biblScope unit="issue">2</biblScope>
			<biblScope unit="page" from="359" to="380" />
			<date type="published" when="2012">2012</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">Towards good practice in large-scale learning for image classification</title>
		<author>
			<persName><forename type="first">F</forename><surname>Perronnin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Akata</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Harchaoui</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Schmid</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">CVPR</title>
				<imprint>
			<date type="published" when="2012">2012</date>
			<biblScope unit="page" from="3482" to="3489" />
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
