<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta />
    <article-meta>
      <title-group>
        <article-title>Results of the Ontology Alignment Evaluation Initiative 2009?</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author">
          <string-name>Jérôme Euzenat</string-name>
          <email>Jerome.Euzenat@inrialpes.fr</email>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Alfio Ferrara</string-name>
          <email>ferrara@dico.unimi.it</email>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Laura Hollink</string-name>
          <email>laurah@few.vu.nl</email>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Antoine Isaac</string-name>
          <email>aisaac@few.vu.nl</email>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Cliff Joslyn</string-name>
          <email>cliff.joslyn@pnl.gov</email>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Véronique Malaisé</string-name>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Christian Meilicke</string-name>
          <email>christian@informatik.uni-mannheim.de</email>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Andriy Nikolov</string-name>
          <email>a.nikolov@open.ac.uk</email>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Juan Pane</string-name>
          <email>pane@dit.unitn.it</email>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Marta Sabou</string-name>
          <email>r.sabou@open.ac.uk</email>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>François Scharffe</string-name>
          <email>Francois.Scharffe@inrialpes.fr</email>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Pavel Shvaiko</string-name>
          <email>pavel.shvaiko@infotn.it</email>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Vassilis Spiliopoulos</string-name>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Heiner Stuckenschmidt</string-name>
          <email>heiner@informatik.uni-mannheim.de</email>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Ondřej Šváb-Zamazal</string-name>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Vojtěch Svátek</string-name>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Cássia Trojahn</string-name>
          <email>Cassia.Trojahn@inrialpes.fr</email>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>George Vouros</string-name>
          <email>georgev@aegean.gr</email>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Shenghui Wang</string-name>
          <email>swang@few.vu.nl</email>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <aff id="aff0">
          <label>0</label>
          <institution>INRIA &amp; LIG</institution>
          ,
          <addr-line>Montbonnot</addr-line>
          ,
          <country country="FR">France</country>
        </aff>
        <aff id="aff1">
          <label>1</label>
          <institution>Pacific Northwest National Laboratory</institution>
          ,
          <country country="US">USA</country>
        </aff>
        <aff id="aff2">
          <label>2</label>
          <institution>TasLab</institution>
          ,
          <addr-line>Informatica Trentina, Trento</addr-line>
          ,
          <country country="IT">Italy</country>
        </aff>
        <aff id="aff3">
          <label>3</label>
          <institution>The Open University</institution>
          ,
          <country country="GB">UK</country>
        </aff>
        <aff id="aff4">
          <label>4</label>
          <institution>Università degli Studi di Milano</institution>
          ,
          <country country="IT">Italy</country>
        </aff>
        <aff id="aff5">
          <label>5</label>
          <institution>University of Economics</institution>
          ,
          <addr-line>Prague</addr-line>
          ,
          <country country="CZ">Czech Republic</country>
        </aff>
        <aff id="aff6">
          <label>6</label>
          <institution>University of Mannheim</institution>
          ,
          <addr-line>Mannheim</addr-line>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff7">
          <label>7</label>
          <institution>University of Trento</institution>
          ,
          <addr-line>Povo, Trento</addr-line>
          ,
          <country country="IT">Italy</country>
        </aff>
        <aff id="aff8">
          <label>8</label>
          <institution>University of the Aegean</institution>
          ,
          <country country="GR">Greece</country>
        </aff>
        <aff id="aff9">
          <label>9</label>
          <institution>Vrije Universiteit Amsterdam</institution>
          ,
          <country country="NL">The Netherlands</country>
        </aff>
      </contrib-group>
      <pub-date>
        <year>2009</year>
      </pub-date>
      <abstract>
        <p>Ontology matching consists of finding correspondences between ontology entities. OAEI campaigns aim at comparing ontology matching systems on precisely defined test cases. Test cases can use ontologies of different nature (from expressive OWL ontologies to simple directories) and use different modalities, e.g., blind evaluation, open evaluation, consensus. OAEI-2009 builds over previous campaigns by having 5 tracks with 11 test cases followed by 16 participants. This paper is an overall presentation of the OAEI 2009 campaign.</p>
      </abstract>
    </article-meta>
  </front>
  <body>
    <sec id="sec-1">
      <title>1 Introduction</title>
      <p>
        The Ontology Alignment Evaluation Initiative1 (OAEI) is a coordinated international
initiative that organizes the evaluation of the increasing number of ontology matching
? This paper improves on the “Preliminary results” initially published in the on-site proceedings
of the ISWC workshop on Ontology Matching (OM-2009). The only official results of the
campaign, however, are on the OAEI web site.
1 http://oaei.ontologymatching.org
systems [
        <xref ref-type="bibr" rid="ref10">10</xref>
        ]. The main goal of OAEI is to compare systems and algorithms on the same
basis and to allow anyone to draw conclusions about the best matching strategies.
Our ambition is that from such evaluations, tool developers can learn and improve their
systems. The OAEI campaign provides the evaluation of matching systems on
consensus test cases.
      </p>
      <p>
        Two first events were organized in 2004: (i) the Information Interpretation and
Integration Conference (I3CON) held at the NIST Performance Metrics for Intelligent
Systems (PerMIS) workshop and (ii) the Ontology Alignment Contest held at the
Evaluation of Ontology-based Tools (EON) workshop of the annual International Semantic
Web Conference (ISWC) [23]. Then, unique OAEI campaigns occurred in 2005 at the
workshop on Integrating Ontologies held in conjunction with the International
Conference on Knowledge Capture (K-Cap) [
        <xref ref-type="bibr" rid="ref2">2</xref>
        ], in 2006 at the first Ontology Matching
workshop collocated with ISWC [
        <xref ref-type="bibr" rid="ref9">9</xref>
        ], in 2007 at the second Ontology Matching
workshop collocated with ISWC+ASWC [
        <xref ref-type="bibr" rid="ref11">11</xref>
        ], and in 2008, OAEI results were presented
at the third Ontology Matching workshop collocated with ISWC [
        <xref ref-type="bibr" rid="ref4">4</xref>
        ]. Finally, in 2009,
OAEI results were presented at the fourth Ontology Matching workshop collocated with
ISWC, in Chantilly, Virginia USA2.
      </p>
      <p>We have continued previous years’ trend by having a large variety of test cases that
emphasize different aspects of ontology matching. This year we introduced two new
tracks that have been identified in the previous years:
oriented alignments in which the reference alignments are not restricted to
equivalence but also comprise subsumption relations;
instance matching dedicated to the delivery of alignment between instances as
necessary for producing linked data.</p>
      <p>This paper serves as an introduction to the evaluation campaign of 2009 and to the
results provided in the following papers. The remainder of the paper is organized as
follows. In Section 2 we present the overall testing methodology that has been used.
Sections 3-10 discuss in turn the settings and the results of each of the test cases.
Section 11 evaluates, across all tracks, the participant results with respect to their capacity
to preserve the structure of ontologies. Section 12 overviews lessons learned from the
campaign. Finally, Section 13 outlines future plans and Section 14 concludes the paper.
2</p>
    </sec>
    <sec id="sec-2">
      <title>General methodology</title>
      <p>We first present the test cases proposed this year to OAEI participants. Then, we
describe the three steps of the OAEI campaign and report on the general execution of the
campaign. In particular, we list participants and the tests they considered.
2.1</p>
      <sec id="sec-2-1">
        <title>Tracks and test cases</title>
        <p>This year’s campaign has consisted of 5 tracks gathering 11 data sets and different
evaluation modalities.</p>
        <sec id="sec-2-1-1">
          <title>2 http://om2009.ontologymatching.org</title>
          <p>The benchmark track (§3): Like in previous campaigns, a systematic benchmark
series has been produced. The goal of this benchmark series is to identify the areas in
which each matching algorithm is strong and weak. The test is based on one
particular ontology dedicated to the very narrow domain of bibliography and a number
of alternative ontologies of the same domain for which alignments are provided.
The expressive ontologies track offers ontologies using OWL modeling capabilities:
Anatomy (§4): The anatomy real world case is about matching the Adult Mouse
Anatomy (2744 classes) and the NCI Thesaurus (3304 classes) describing the
human anatomy.</p>
          <p>Conference (§5): Participants are asked to find all correct correspondences
(equivalence and/or subsumption) and/or ‘interesting correspondences’ within a
collection of ontologies describing the domain of organizing conferences (the
domain being well understandable for every researcher). Results are evaluated
a posteriori in part manually and in part by data-mining techniques and
logical reasoning techniques. They are also evaluated against reference alignments
based on a subset of the whole collection.</p>
          <p>The directories and thesauri track proposes web directories, thesauri and generally
less expressive resources:
Fishery gears: This test case features four different classification schemes,
expressed in OWL, adopted by different fishery information systems in FIM
division of FAO. An alignment performed on these 4 schemes should be able to
spot equivalence, or a degree of similarity, between the fishing gear types
and the groups of gears, so as to enable a future exercise of data aggregation
across systems.</p>
          <p>Directory (§6): The directory real world case consists of matching web sites
directories (like open directory or Yahoo’s). It is more than 4 thousand elementary
tests.</p>
          <p>Library (§7): Three large SKOS subject heading lists for libraries have to be
matched using relations from the SKOS vocabulary. Results are evaluated on
the basis of (i) a partial reference alignment (ii) using the alignments to
reindex books from one vocabulary to the other.</p>
        </sec>
      </sec>
      <sec id="sec-2-2">
        <title>Oriented alignments (benchmark-subs §8) :</title>
        <p>This track focuses on the evaluation of alignments that contain other relations than
equivalences.</p>
        <p>Instance matching (§9): The instance data matching track aims at evaluating tools
able to identify similar instances among different datasets. It features Web datasets,
as well as a generated benchmark:</p>
      </sec>
      <sec id="sec-2-3">
        <title>Eprints-Rexa-Sweto/DBLP benchmark (ARS) three datasets containing in</title>
        <p>stances from the domain of scientific publications;
TAP-Sweto-Tesped-DBpedia three datasets covering several topics and
structured according to different ontologies;
IIMB A benchmark generated using one dataset and modifying it according to
various criteria.</p>
      </sec>
      <sec id="sec-2-4">
        <title>Very large crosslingual resources (§10): The purpose of this task (vlcr) is to</title>
        <p>match the Thesaurus of the Netherlands Institute for Sound and Vision (called
GTAA) to two other resources: the English WordNet from Princeton University
and DBpedia.</p>
        <p>Table 1 summarizes the variation in the results expected from these tests.</p>
        <p>For the first time this year we had to cancel two tracks, namely Fishery and
TAPSweto-Tesped-DBpedia due to the lack of participants. This is a pity for those who
have prepared these tracks, and we will investigate what led to this situation in order to
improve next year.</p>
        <p>test formalism
relations
confidence modalities
language
benchmarks OWL</p>
        <p>anatomy OWL
conference OWL-DL</p>
        <p>fishery OWL
directory OWL
library SKOS
+OWL
OWL
RDF
RDF
RDF
SKOS
+OWL
benchmarksubs
ars
tap
iimb
vlcr
=
=
=, &lt;=
=
=
exact-,narrow-,
broadMatch
=,&lt;,&gt;
=
=
=
exact-,
closeMatch
Ontologies to be matched and (where applicable) reference alignments have been
provided in advance during the period between June 1st and June 22nd, 2009. This gave
potential participants the occasion to send observations, bug corrections, remarks and
other test cases to the organizers. The goal of this preparatory period is to ensure that
the delivered tests make sense to the participants. The final test base was released on
July 6th. The data sets did not evolve after this period.
2.3</p>
      </sec>
      <sec id="sec-2-5">
        <title>Execution phase</title>
        <p>
          During the execution phase, participants used their systems to automatically match the
ontologies from the test cases. Participants have been asked to use one algorithm and the
same set of parameters for all tests in all tracks. It is fair to select the set of parameters
that provide the best results (for the tests where results are known). Beside parameters,
the input of the algorithms must be the two ontologies to be matched and any general
purpose resource available to everyone, i.e., no resource especially designed for the test.
In particular, participants should not use the data (ontologies and reference alignments)
from other test cases to help their algorithms. In most cases, ontologies are described
in OWL-DL and serialized in the RDF/XML format. The expected alignments are
provided in the Alignment format expressed in RDF/XML [
          <xref ref-type="bibr" rid="ref8">8</xref>
          ]. Participants also provided
the papers that are published hereafter and a link to their systems and their configuration
parameters.
2.4
        </p>
      </sec>
      <sec id="sec-2-6">
        <title>Evaluation phase</title>
        <p>The organizers have evaluated the alignments provided by the participants and returned
comparisons on these results.</p>
        <p>In order to ensure that it is possible to process automatically the provided results, the
participants have been requested to provide (preliminary) results by September 1st. In
the case of blind tests only the organizers did the evaluation with regard to the withheld
reference alignments.</p>
        <p>The standard evaluation measures are precision and recall computed against the
reference alignments. For the matter of aggregation of the measures we use weighted
harmonic means (weights being the size of the true positives). This clearly helps in the
case of empty alignments. Another technique that has been used is the computation of
precision/recall graphs so it was advised that participants provide their results with a
weight to each correspondence they found. New measures addressing some limitations
of precision and recall have also been used for testing purposes as well as measures
compensating for the lack of complete reference alignments.
2.5</p>
      </sec>
      <sec id="sec-2-7">
        <title>Comments on the execution</title>
        <p>After a decreased number of participants last year, this year the number increased again:
4 participants in 2004, 7 in 2005, 10 in 2006, 17 in 2007, 13 in 2008, and 16 in 2009.</p>
        <p>The number of covered runs has slightly increased: 53 in 2009, 50 in 2008, and 48
in 2007. This may be due to the increasing specialization of tests: some systems are
specifically designed for instance matching or for anatomy.</p>
        <p>We have not had enough time to systematically validate the results which had been
provided by the participants, but we ran a few systems and we scrutinized some of the
results.</p>
        <p>The list of participants is summarized in Table 2. Similar to previous years not
all participants provided results for all tests. They usually did those which are easier
to run, such as benchmark, anatomy, directory, and conference. The variety of tests
and the short time given to provide results have certainly prevented participants from
considering more tests.</p>
        <p>The set of participants is divided into two main categories: those who participated
in the instance matching track and those who participated in ontology matching tracks.
Only a few systems (DSSim and RiMOM) participated in both types of tracks.</p>
        <p>The summary of the results track by track is provided in the following sections.
3</p>
      </sec>
    </sec>
    <sec id="sec-3">
      <title>Benchmark</title>
      <p>The goal of the benchmark tests is to provide a stable and detailed picture of each
algorithm. For that purpose, the algorithms are run on systematically generated test
cases.</p>
      <p><!-- NOTE(review): garbled extraction (p. 6) of the Table 2 header listing the
           participating systems as rotated column labels; names decoded where the letter
           sets are unambiguous, TODO confirm against the original table -->
aflood AgrMaker AMExt AROMA ASMOV DSSim FBEM GeRoMe GG2WW HMatch kosimap Lily MapPSO RiMOM SOBOM TaxoMap total</p>
      <p>System
The domain of this first test is Bibliographic references. It is based on a subjective view
of what must be a bibliographic ontology. There may be many different classifications
of publications, for example, based on area and quality. The one chosen here is common
among scholars and is based on publication categories; like many ontologies (tests
#301-304), it is reminiscent of BibTeX.</p>
      <p>The systematic benchmark test set is built around one reference ontology and
many variations of it. The ontologies are described in OWL-DL and serialized in the
RDF/XML format. The reference ontology is that of test #101. It contains 33 named
classes, 24 object properties, 40 data properties, 56 named individuals and 20
anonymous individuals. Participants have to match this reference ontology with the variations.
Variations are focused on the characterization of the behavior of the tools rather than
having them compete on real-life problems. They are organized in three groups:
Simple tests (1xx) such as comparing the reference ontology with itself, with another
irrelevant ontology (the wine ontology used in the OWL primer) or the same
ontology in its restriction to OWL-Lite;
Systematic tests (2xx) obtained by discarding features from some reference ontology.</p>
      <p>It aims at evaluating how an algorithm behaves when a particular type of
information is lacking. The considered features were:
– Name of entities that can be replaced by random strings, synonyms, name with
different conventions, strings in another language than English;
– Comments that can be suppressed or translated in another language;
– Specialization hierarchy that can be suppressed, expanded or flattened;
– Instances that can be suppressed;
– Properties that can be suppressed or having the restrictions on classes
discarded;
– Classes that can be expanded, i.e., replaced by several classes or flattened.</p>
      <sec id="sec-3-1">
        <title>Four real-life ontologies of bibliographic references (3xx) found on the web and left</title>
        <p>mostly untouched (there were added xmlns and xml:base attributes).</p>
        <p>Since the goal of these tests is to offer some kind of permanent benchmarks to be
used by many, the test is an extension of the 2004 EON Ontology Alignment Contest,
whose test numbering it (almost) fully preserves.</p>
        <p>The tests are roughly the same as last year. We only suppressed some
correspondences that rendered the merged ontologies inconsistent (in 301 and 304) since an
increasing number of systems were able to test the consistency of the resulting alignments.</p>
        <p>The kind of expected alignments is still limited: they only match named classes and
properties, they mostly use the "=" relation with confidence of 1. Full description of
these tests can be found on the OAEI web site.
3.2</p>
      </sec>
      <sec id="sec-3-2">
        <title>Results</title>
        <p>Twelve systems participated in the benchmark track of this year’s campaign (see
Table 2). Three systems that had participated last year (CIDER, SAMBO, and SPIDER)
did not participate this year.</p>
        <p>Table 3 shows the results, by groups of tests. The results of last year are also
provided. We display the results of participants as well as those given by some simple edit
distance algorithm on labels (edna). The computed values are real precision and recall
and not an average of precision and recall. The full results are on the OAEI web site.</p>
        <p>As shown in Table 3, two systems are ahead: Lily and ASMOV, with aflood and
RiMOM as close followers (with GeRoME, AROMA, DSSim, and AgreementMaker –
which is referred as AgrMaker in the tables and figures – having intermediary
performance). Last year, ASMOV, Lily and RiMOM had the best performance, followed by
AROMA, DSSim, and aflood. No system had strictly lower performance than edna.</p>
        <p>Looking for each group of tests, in simple tests (1xx) all systems have similar
performance, excluding SOBOM and TaxoMap. Each algorithm has its best score with the
1xx test series. For systematic tests (2xx), which allow one to distinguish the strengths of
algorithms, Lily and ASMOV are again ahead of the other systems. Finally, for real
cases (3xx), AgreementMaker and aflood provide the best results, with Lily, RiMOM,
ASMOV, AROMA, and DSSim as followers. There is no unique best system for all
group cases.</p>
        <p>Looking for improvements in the systems participating both this year and in the
last campaign, GeRoMe and MapPSO have significantly improved their results both in
terms of precision and recall, while aflood provides better recall and AROMA improves
its results in real cases.</p>
        <p>
          The results have also been compared with the symmetric measure proposed in [
          <xref ref-type="bibr" rid="ref7">7</xref>
          ].
It is a generalisation of precision and recall in order to better discriminate systems
that slightly miss the target from those which are grossly wrong. This measure slightly
improves traditional precision and recall, which are displayed in Table 3 (“Symmetric
relaxed measures”). This year, MapPSO has significantly better symmetric precision
and recall than classical precision and recall, to the point that it is at the level of the best
<!-- NOTE(review): the remainder of this paragraph was garbled rotated column labels
     from Table 3 (appears to read "H-mean 1xx 2xx 3xx"); TODO restore from the
     original table -->
        </p>
        <p><!-- NOTE(review): this span was garbled extraction of Table 3 (precision and
             recall figures per system and per test group, with rotated labels such as
             "Prec." and "Rec."); unreadable residue removed, TODO restore the figures
             from the original table --></p>
        <p>
but not by far.</p>
        <p>Figure 2 shows the precision and recall graphs of this year. These results are only
relevant for the results of participants who provide confidence measures different from
1 or 0 (see Table 2). This graph has been drawn with only technical adaptation of the
technique used in TREC. Moreover, due to lack of time, these graphs have been
computed by averaging the graphs of each of the tests (instead to pure precision and recall).
refalign</p>
        <p>These results and those displayed in Figure 1 single out the same group of systems,
Lily, ASMOV, aflood, and RiMOM which seem to perform these tests at the highest
level of quality. Of these, Lily and ASMOV have slightly better results than the two
others. So, this confirms the leadership that we observed on raw results.</p>
        <p>Like in the three previous campaigns, there is a gap between these systems and their
followers (GeRoME, AROMA, DSSim, and AgreementMaker).</p>
        <p>refalign
AgrMaker</p>
        <p>DSSim</p>
        <p>Lily
SOBOM
recall
edna
AROMA
GeRoMe
MapPSO
TaxoMap
aflood
ASMOV
kosimap
RiMOM</p>
        <p>Fig. 2. Precision/recall graphs for benchmarks. The results given by the participants are cut under
a threshold necessary for achieving n% recall and the corresponding precision is computed.
Systems for which these graphs are not meaningful (because they did not provide graded confidence
values) are drawn in dashed lines.</p>
      </sec>
    </sec>
    <sec id="sec-4">
      <title>Anatomy</title>
      <p>
        Within the anatomy track we confront existing matching technology with real world
ontologies. Currently, we find such real world cases primarily in the biomedical domain,
where a significant number of ontologies have been built covering different aspects of
medical research. Due to the complexity and the specialized vocabulary of the domain,
matching biomedical ontologies is one of the hardest alignment problems.
The ontologies of the anatomy track are the NCI Thesaurus describing the human
anatomy, published by the National Cancer Institute (NCI)3, and the Adult Mouse
Anatomical Dictionary4, which has been developed as part of the Mouse Gene
Expression Database project. Both resources are part of the Open Biomedical Ontologies
(OBO). A detailed description of the data set has been given in the context of OAEI
2007 [
        <xref ref-type="bibr" rid="ref11">11</xref>
        ] and 2008 [
        <xref ref-type="bibr" rid="ref4">4</xref>
        ].
      </p>
      <p>As proposed in 2008 the task of automatically generating an alignment has been
divided into four subtasks. Task #1 is obligatory for participants of the anatomy track,
while task #2, #3 and #4 are optional tasks.</p>
      <p>– For task #1 the matcher has to be applied with standard settings to obtain a result
that is as good as possible with respect to the expected F-measure.
– In task #2 / #3 an alignment has to be generated that favors precision over recall and
vice versa. Systems configurable with respect to these requirements will be more
useful in particular application scenarios.
– In task #4 we simulate that a group of domain experts created an incomplete
reference alignment Rp. Given both ontologies as well as Rp, a matching system should
be able to exploit the additional information encoded in Rp.</p>
      <p>
        Due to the harmonization of the ontologies applied in the process of generating a
reference alignment (see [
        <xref ref-type="bibr" rid="ref3">3</xref>
        ] and [
        <xref ref-type="bibr" rid="ref11">11</xref>
        ]), a high number of rather trivial correspondences
(61%) can be found by simple string comparison techniques. At the same time, we
have a good share of non-trivial correspondences (39%). The partial reference
alignment used in subtrack #4 is the union of all trivial correspondences and 54 non-trivial
correspondences.
4.2
      </p>
      <sec id="sec-4-1">
        <title>Results</title>
        <p>In total, ten systems participated in the anatomy track (in 2007 there were eleven
participants, in 2008 nine systems participated). An overview is given in Table 4.
While the number of participants is stable, we find systems participating for the first
time (SOBOM, kosimap), systems re-entering the competition after a year of absence
(AgreementMaker, which is referred to as AgrMaker in the tables) and systems
continuously participating (ASMOV, DSSim, Lily, RiMOM, TaxoMap).
3 http://www.cancer.gov/cancerinfo/terminologyresources/
4 http://www.informatics.jax.org/searches/AMA_form.shtml
System
aflood
AgrMaker
AROMA
AOAS
ASMOV
DSSim
Falcon-AO
kosimap
Lily
Prior+
RiMOM
SAMBO
SOBOM
TaxoMap
X-SOM
avg. F-measure</p>
        <p>In Table 4 we have marked the participants with an F-measure ≥ 0.8 with a +
symbol. Unfortunately, the top performers of the last two years do not participate this
year (AOAS in 2007, SAMBO in 2008). In the last row of the table the average of the
obtained F-measures is shown. We observe significant improvements over time.
However, in each of the three years the top systems generated alignments with an F-measure of
0.85. It seems that there is an upper bound which is hard to exceed.</p>
        <p>Runtime Due to the evaluation process of the OAEI, the submitted alignments have
been generated by the participants, who run the respective systems on their own
machines. Nevertheless, the resulting runtime measurements provide an approximate basis
for a useful comparison. In 2007, we observed significant differences with respect to the
stated runtimes. Lily required several days for completing the matching task and more
than half of the systems could not match the ontologies in less than one hour. In 2008
we already observed increased runtimes. This year’s evaluation revealed that only one
system still requires more than one hour. The fastest system is aflood (15 sec) followed
by AROMA, which requires approximately 1 minute. Notice that aflood is run with a
configuration optimized for runtime efficiency in task #1; it requires 4 minutes with a
configuration which aims at generating an optimal alignment used for #2, #3, and #4.
Detailed information about runtimes can be found in the second column of Table 5.
Results for subtracks #1, #2 and #3 Table 5 lists the results of the participants in
descending order with respect to the F-measure achieved for subtrack #1. In the first
two rows we find SOBOM and AgreementMaker. Both systems have very good results
and distance themselves from the remaining systems. SOBOM, although participating
for the first time, submitted the best result in 2009. The system seems to be optimized
SOBOM
AgrMaker
RiMOM
TaxoMap
DSSim
ASMOV
aflood
Lily
AROMA
kosimap
for generating a precise alignment, however, the submitted alignment contains also a
number of non trivial correspondences (see the column Recall+ for subtrack #1).5</p>
        <p>AgreementMaker generates a less precise alignment, but manages to output a higher
number of correct correspondences. None of the other systems detected a higher number
of non-trivial correspondences for both subtrack #1 and #3 in 2009. However, it cannot
top the SAMBO submission of 2008, which is known for its extensive use of biomedical
background knowledge.</p>
        <p>The RiMOM system is slightly worse with respect to the achieved F-measure
compared to its 2008 submission. The precision has been improved, however, this caused
a loss of recall and in particular a significant loss of recall+. Unfortunately, RiMOM
did not participate in subtask #3, so we cannot make statements about its strength in
detecting non-trivial correspondences based on a different configuration.</p>
        <p>The systems listed in the following columns achieve similar results with respect to
the overall quality of the generated alignments (F-measures between 0.72 and 0.76).
However, significant differences can be found in terms of the trade-off between
precision and recall. All systems except ASMOV and Lily favor precision over recall. Notice
that an F-measure of 0.755 can easily be achieved by constructing a highly precise
alignment without detecting any non-trivial correspondences. At the same time it is relatively
hard to generate an alignment with an F-measure of 0.755 that favors recall over
precision. Thus, the results of ASMOV and Lily have to be interpreted more positively than
indicated by the F-measure.</p>
        <p>
          The observation that it is not hard to construct a highly precise alignment with
acceptable recall is supported by the results of subtask #2, where we find relatively
similar results for all participants. In particular, it turned out that some systems (ASMOV,
DSSim) have their best F-measure in track #2. The evaluation results for aflood require
some additional explanations. aflood is run for track #1 with a configuration which
results in a significant reduction of the runtime (15 sec), while for track #2 and #3 the
5 Recall+ is defined as recall restricted to the subset of non trivial correspondences in the
reference alignment. A detailed definition can be found in the results paper of 2007 [
          <xref ref-type="bibr" rid="ref11">11</xref>
          ].
system required approximately 4 minutes due to different settings. Therefore, aflood
creates better alignments as solutions to subtask #2 and #3.
        </p>
        <p>In 2007 we were surprised by the good performance of the naive label
comparison approach. Again, we have to emphasize that this is to a large degree based on the
harmonization of the ontologies that has been applied in the context of generating the
reference alignment. Nevertheless, the majority of participants was able to top the
results of the trivial string matching approach this year.</p>
        <p>Results for subtrack #4 In the following we refer to an alignment generated for task
#1 resp. #4 as A1 resp. A4. This year we have chosen an evaluation strategy that differs
from the approach of the last year. We compare A1 ∪ Rp resp. A4 ∪ Rp with the reference
alignment R. Thus, we compare the situation where the partial reference alignment is
added after the matching process has been conducted against the situation where the
partial reference alignment is available as additional resource used within the matching
process. The results are presented in Table 6.</p>
        <p>System
SAMBOdtf2008
ASMOV
aflood#3
TaxoMap
AgrMaker</p>
        <p>-Precision
+0.020 0.837→0.856
+0.034 0.759→0.792
+0.005 0.838→0.843
+0.019 0.878→0.897
+0.128 0.870→0.998</p>
        <p>-Recall
+0.003 0.867→0.870</p>
        <p>-0.018 0.808→0.790
+0.003 0.825→0.827
-0.026 0.732→0.706
-0.181 0.831→0.650</p>
        <p>-F-Measure
+0.011 0.852→0.863
+0.009 0.782→0.791
+0.004 0.831→0.835
-0.008 0.798→0.790
-0.063 0.850→0.787</p>
        <p>Four systems participated in task #4. These systems were aflood, AgreementMaker,
ASMOV and TaxoMap. In Table 6 we additionally added a row that displays the 2008
submission of SAMBOdtf, which had the best results for subtrack #4 in 2008. For
aflood we used A3 instead of A1 to allow a fair comparison, due to the fact that A1 was
generated with runtime optimization configuration.</p>
        <p>A first look at the results shows that all systems use the partial reference
alignment to increase the precision of their systems. Most of them have slightly better
values for precision (between 0.5% and 3.4%), only AgreementMaker uses the
additional information in a way which has a stronger impact in terms of a significantly
increased precision. However, only three correspondences have been found that have
not been in the partial reference alignment previously6. Only SAMBOdtf and aflood
profit from the partial reference alignment by a slightly increased recall, while the other
systems wrongly filter out some correct correspondences. This might be based on two
specifics of the dataset. On the one hand the major part of the reference alignment
consists of trivial correspondences easily detectable by string matching algorithms, while
the unknown parts share a different characteristic. Any approach which applies
machine learning techniques to learn from the partial reference alignment is thus bound to
fail. On the other hand parts of the matched ontologies are incomplete with respect to
6 Notice that we only take correspondences between anatomical concepts into account.
subsumption axioms. As pointed out in [16], the completeness of the structure and the
correct use of the structural relations within the ontologies has an important influence
on the quality of the results. For these reasons it is extremely hard to use the partial
reference alignment in an appropriate way in subtask #4.
Although it is argued that domain related background knowledge is a crucial point in
matching biomedical ontologies (see for example [1; 20]), the results of 2009 raise
some doubts about this issue. While in 2007 and 2008 the competition was clearly
dominated by matching systems heavily exploiting background knowledge (UMLS),
this year's top performer SOBOM uses none of these techniques. However, the strong
F-measure of SOBOM is mainly based on high precision. Comparing the alignments
generated by SAMBO in 2008 and SOBOM in 2009 it turns out that SAMBO detected
136 correct correspondences not found by SOBOM, while SOBOM finds 36 correct
correspondences not detected by SAMBO. Unfortunately, SOBOM did not participate
in subtrack #3. Thus, it is hard to assess its capability for detecting non-trivial
correspondences. The results of subtask #4 are disappointing at first sight. Since this kind of
task has been introduced in 2008, we expected better results in 2009. However, it turned
out again that only minor positive effects can be achieved. But, as already argued, the
task of subtrack #4 is hard and systems with acceptable results in subtrack #4 might
obtain good results under better conditions.
5</p>
      </sec>
    </sec>
    <sec id="sec-5">
      <title>Conference</title>
      <p>The conference test set introduces matching several more-or-less expressive ontologies.
Within this track the results of participants are evaluated using diverse evaluation
methods. First, classical evaluation wrt. the reference alignment was made, for the ontology
pairs where this alignment is available. Second, posterior manual evaluation was made
for all ontology pairs using even sampling across all matchers. Third, the complete
results were submitted to a data mining tool for discovery of association hypotheses,
taking into account specific mapping patterns. Fourth, alignment incoherence was
analysed with the help of a logical reasoner.
5.1</p>
      <sec id="sec-5-1">
        <title>Test data</title>
        <p>The collection consists of fifteen ontologies in the domain of organizing conferences.
Ontologies have been developed within the OntoFarm project7. In contrast to last year’s
conference track, we also considered subsumption results in evaluation.</p>
        <p>The main features of this test set are:
– Generally understandable domain. Most ontology engineers are familiar with
organizing conferences. Therefore, they can create their own ontologies as well as
evaluate the alignment among their concepts with enough erudition.</p>
        <sec id="sec-5-1-1">
          <title>7 http://nb.vse.cz/~svatek/ontofarm.html</title>
          <p>– Independence of ontologies. Ontologies were developed independently and based
on different resources, they thus capture the issues in organizing conferences from
different points of view and with different terminologies.
– Relative richness in axioms. Most ontologies were equipped with DL axioms of
various kinds, which opens a way to use semantic matchers.</p>
          <p>Ontologies differ in numbers of classes, of properties, in their DL expressivity, but
also in underlying resources. Ten ontologies are based on tools supporting the task
of organizing conferences, two are based on experience of people with personal
participation in conference organization, and three are based on web pages of concrete
conferences.</p>
          <p>Participants were to provide all correct correspondences (equivalence and/or
subsumption) and/or “interesting correspondences” within a collection of ontologies
describing the domain of organizing conferences.</p>
          <p>This year, results of participants are evaluated by four different methods of
evaluation: evaluation based on reference alignment, manual labeling, data mining method,
and logical reasoning. In addition, we extended the reference alignment from the
previous year. Now we have 21 alignments, which correspond to the complete alignment
space between 7 ontologies from the data set. Manual evaluation produced statistics
such as precision and will also serve as input into evaluation based on data mining and
will help in the process of improving and building a reference alignment. Results of
participants are checked with regard to their incoherency. These evaluation methods are
concisely described at the track result page.
5.2</p>
        </sec>
      </sec>
      <sec id="sec-5-2">
        <title>Results</title>
        <p>We had seven participants: aflood, AgreementMaker (AgrMaker), AMExt (an extended
version of AgreementMaker), AROMA, ASMOV, DSSim, and kosimap. Here are some
basic data, besides evaluations:
– All participants delivered all 105 alignments, except for aflood, which delivered
103 alignments.
– Two participants (ASMOV and DSSim) delivered not only equivalence
correspondences but also subsumptions.
– aflood and DSSim matchers delivered “certain” correspondences; other matchers
delivered correspondences with confidence values between 0 and 1.</p>
      </sec>
      <sec id="sec-5-3">
        <title>Evaluation based on reference alignment We evaluated the results of participants</title>
        <p>against a reference alignment. In the case of ASMOV and DSSim we filtered out
subsumptions. It includes all pairwise combinations of different 7 ontologies (21
alignments).</p>
        <p>In Table 7, there are traditional precision, recall, and F-measure computed for three
different thresholds of certainty factor (0.2, 0.5, and 0.7).</p>
        <p>For better comparison we established the confidence threshold which provides the
highest average F-measure (Table 8). Precision, Recall, and F-measure are given for this
optimal confidence threshold. The dependency of F-measure on confidence threshold</p>
        <p>Fig. 3. F-measures depending on confidence.
can be seen from Figure 3. There are two asterisks in the column of confidence threshold
for matchers which did not provide graded confidence.</p>
        <p>In conclusion, the matcher with the highest average F-measure (0.57) is that of
AgreementMaker at 0.75. However we should take into account that this evaluation has
been made over a small part of all alignments (one fifth).</p>
        <p>Comparison with previous year We evaluated the results of participants of OAEI 2008
(ASMOV, DSSim and Lily) against the new reference alignments. For these three
matchers from OAEI 2008, we found an optimal confidence threshold in terms of
highest average F-measure, see Table 9. In the case of DSSim there is an asterisk because
this matcher did not provide graded confidence.</p>
        <p>In conclusion, the matcher with the highest average F-measure (0.49) was the
DSSim. However we should take into account that this evaluation has been made over
a small part of all alignments (one fifth). We can also compare performance of
participants of both years, ASMOV and DSSim. While in terms of highest average F-measure
ASMOV improved from 43% to 47%, DSSim declined from 49% to 22%. We can also
see that ASMOV matcher from OAEI 2009 delivered more correspondences with lower
confidence than in OAEI 2008.</p>
        <p>matcher
ASMOV
DSSim</p>
        <p>Lily
confidence threshold Prec. Rec. FMeas.
Restricted semantic precision and recall Furthermore, we computed restricted
semantic precision and recall using a tool from University of Mannheim [12]. We took into
account matchers which delivered correspondences with subsumption relations, i.e.,
ASMOV and DSSim. In Table 10 there are two different semantics variants (natural and
pragmatic) of restricted semantic precision and recall computed for confidence
threshold 0.23 (see footnote 8).</p>
        <p>matcher
ASMOV
DSSim</p>
        <p>natural
Prec. Rec.</p>
        <p>In conclusion, from Table 10 we can see that considering correspondences with
subsumption relations ASMOV has better performance in both precision and recall,
whereas DSSim has much better recall at expense of lower precision.
8 This is an optimal confidence threshold in terms of highest F-measure for ASMOV. DSSim does
not have graded confidence.</p>
      </sec>
      <sec id="sec-5-4">
        <title>Evaluation based on posterior manual labeling This year we take the most secure,</title>
        <p>i.e., with highest confidence, correct correspondences as a population for each matcher.
It means we evaluate 150 correspondences per matcher randomly chosen from all
correspondences of all 105 alignments with confidence 1.0 (sampling). Because AROMA,
ASMOV and kosimap do not have enough correspondences with 1.0 confidence we
take 150 correspondences with highest confidence. In the case of AROMA it was not
possible to distinguish between all 153 correspondences so we sampled over its
population.</p>
        <p>In Table 11 you can see approximated precisions for each matcher over its
population of best correspondences. N is a population of all the best correspondences for
one matcher. n is a number of randomly chosen correspondences so as to have 150 best
correspondences for each matcher. TP is a number of correct correspondences from the
sample, and P* is an approximation of precision for the correspondences in each
population; additionally there is a margin of error computed as √((N−n)/(N−1)) based on [24].
matcher
aflood</p>
        <p>AgrMaker AMExt AROMA ASMOV</p>
        <p>DSSim kosimap
N
n
TP
P*</p>
        <p>From Table 11 we can conclude that kosimap has the best precision (0.96) over its
150 most confident correspondences.</p>
      </sec>
      <sec id="sec-5-5">
        <title>Evaluation based on data mining supported with mapping patterns (based on</title>
        <p>[19]). As opposed to ontology design patterns9, which usually concern one ontology,
mapping patterns deal with (at least) two ontologies. Mapping patterns reflect the
internal structure of ontologies as well as correspondences across the ontologies.</p>
        <p>We recognise nine mapping patterns:
– MP1 (“Parent-child triangle”): it consists of an equivalence correspondence
between classes A and B and an equivalence correspondence between A and a child
of B, where A and B are from different ontologies.
– MP2 (“Mapping along taxonomy”): it consists of simultaneous equivalence
correspondences between parents and between children.
– MP3 (“Sibling-sibling triangle”): it consists of simultaneous correspondences
between class A and two sibling classes C and D where A is from one ontology and
C and D are from another ontology.
– MP4: it is inspired by the ’class-by-attribute’ correspondence pattern, where the
class in one ontology is restricted to only those instances having a particular value
for a given attribute/relation.
9 See http://ontologydesignpatterns.org.
– MP5: it is inspired by the “composite” correspondence pattern. It consists of a
class-to-class equivalence correspondence and a property-to-property equivalence
correspondence, where classes from the first correspondence are in the domain or
in the range of properties from the second correspondence.
– MP6: it is inspired by the “attribute to relation” correspondence pattern where a
datatype and an object property are aligned as an equivalence correspondence.
– MP7: it is the variant of the MP5 “composite pattern”. It consists of an equivalence
correspondence between two classes and an equivalence correspondence between
two properties, where one class from the first correspondence is in the domain and
the other class from that correspondence is in the range of equivalent properties,
except the case where domain and range is the same class.
– MP8: it consists of an equivalence correspondence between A and B and an
equivalence correspondence between a child of A and a parent of B where A and B are
from different ontologies. It is sometimes referred to as criss-cross pattern.
– MP9: it is the variant of MP3, where the two sibling classes C and D are disjoint.</p>
        <p>MP4, MP5, and MP6 are inspired by correspondence patterns from [21]. In
principle, it is not possible to tell which mapping pattern is desirable or not desirable. This
must be decided on the basis of an application context or possible alternatives.
However, we could roughly say that while MP2 and MP5 seems to be desirable, MP7, MP8,
and MP9 indicate incorrect correspondences related to inconsistency.</p>
        <p>In Table 12 there are numbers of occurrences of mapping patterns in results of
participants of OAEI 2009. We already see that some patterns are more typical for
some systems than for others. Proper quantification of this relationship as well as its
combination with other characteristics of correspondences is however the task for a
mining tool.</p>
        <p>For the data-mining analysis we employed the 4ft-Miner procedure of the
LISpMiner data mining system10 for mining of association rules. We found several
interesting association hypotheses: t1 to t6 are related to confidence or underlying resources of
ontologies (see Table 13) and m1 to m10 are related to mapping patterns (see Table 14).
In total there were 21117 correspondences in the data matrix. We can interpret some of
these hypotheses as follows:
10 http://lispminer.vse.cz/</p>
        <p>The set of A concepts attached to each book is then used to decide whether these
rules are fired for this book. If the A concept of one rule is contained by the A annotation
of a book, then the rule is fired. As several rules can be fired for a same book, the union
of the consequents of these rules forms the translated B annotation of the book.</p>
        <p>On a set of books selected for evaluation, the generated concepts for a book are then
compared to the ones that are deemed correct for this book. At the annotation level, we
measure the precision, the recall, and the Jaccard overlap measure (Jac.) between the
produced annotation and the correct one.</p>
        <p>In the formulas used, results are counted on a book and annotation basis, and not
on a rule basis. This reflects the importance of different thesaurus concepts: a
translation rule for a frequently used concept is more important than a rule for a rarely used
concept.</p>
        <p>Results. Table 21 shows the results when taking into account all correspondences that
belong to a certain relation selection.</p>
        <p>TaxoMap links evaluated</p>
        <p>LCSH-RAMEAU RAMEAU-SWD</p>
        <p>LCSH-SWD
exactMatch
eM + broadMatch
eM + bM + narrowMatch
all relations</p>
        <p>Table 22 shows the results obtained when selecting only the “best” available
mapping for one concept and discarding the others.</p>
        <p>TaxoMap links evaluated</p>
        <p>LCSH-RAMEAU RAMEAU-SWD</p>
        <p>LCSH-SWD
exactMatch
eM + broadMatch
eM + bM + narrowMatch
all relations</p>
        <p>Prec. Rec. Jac. Prec. Rec. Jac. Prec. Rec. Jac.
22.8 5.8 5.3 14.2 1.9 1.7 1.2 0.002 0.002
10.2 6.0 4.9 6.9 2.0 1.7 – – –
7.2 4.5 3.3 5.9 1.9 1.5 – – –
6.4 4.0 2.9 5.8 1.9 1.5 – – –
The setting for this year’s library task clearly shows the limits of current matching
tools. The case at hand, mostly because of its size and its multilingual aspect, is
extremely difficult to handle. The performance of TaxoMap, from this perspective, should
be regarded as a significant achievement, as it was the only one to manage to ingest
hundreds of concepts and return alignments between them.</p>
        <p>The results of TaxoMap, which could not apply its usual partition approach, and
uses to a great extent automatic translation, are not very good. More precisely, they
are especially weak when relations other than strict equivalence are considered,
highlighting the value of being able to sort mapping results using the type of relation
or the strength of the confidence measure granted to correspondences–options which
are both offered by TaxoMap. Both precision and coverage/recall are low for the
non-equivalence correspondences, even though they bring a huge number of potential
matches. The translation could give better results for the equivalent correspondences, at
the cost of coverage of course.</p>
        <p>It is worth mentioning that as last year, the results for the comparison with a
reference mapping and the re-indexing evaluation largely differ, showing that
correspondences have a different relevance depending on the application scenario.
Correspondences based on translation will perform obviously better for scenarios where the
intension of concepts matters, rather than for cases where their actual usage in book
collections should be carefully taken into account.
8</p>
      </sec>
    </sec>
    <sec id="sec-6">
      <title>Oriented alignment</title>
      <p>
        This year we introduced evaluation of alignments containing other relations than the
classical equivalence between entities, e.g., subsumption relations.
The first dataset (dataset 1) has been derived from the benchmark series of the OAEI
2006 campaign [
        <xref ref-type="bibr" rid="ref9">9</xref>
        ] and was created for the evaluation of the "Classification-Based
Learning of Subsumption Relations" (CSR) method. As a configuration of CSR exploits
the properties of concepts (for the cases where properties are used as features), we do
not include the OAEI 2006 ontologies whose concepts have no properties. Furthermore,
we have excluded from the dataset the OAEI ontologies with no defined subsumption
relations among their concepts. This is done because CSR exploits the subsumption
relations in the input ontologies to generate training examples. More specifically, all
benchmarks (101-304) except 301 to 304, define the second ontology of each pair as an
alteration of the same ontology, i.e., the first one, numbered 101.
      </p>
      <p>The second dataset (dataset 2) is composed of 45 pairs of real-world ontologies
coming from the Consensus Workshop track of the OAEI 2006 campaign (all pairwise
combinations). The domain of the ontologies concerns the organization of conferences
and they have been developed within the OntoFarm project7.</p>
      <p>The reference alignment for all datasets has been manually created by knowledge
engineers. The major guidelines that were followed for the location of subsumption
relations are as follows: (a) use existing equivalences in order to find inferred
subsumptions, and (b) understand the "intended meaning" of the concepts, e.g., by inspecting
specifications and relevant information attached to them. The format of the reference
alignment is the Alignment format as used in the benchmark series.
Three systems returned results for the first dataset, namely, ASMOV, RiMoM and
TaxoMap. We present these results by also presenting the results achieved by CSR (as a
comparison basis), presenting also the results of CSR for the second dataset.
system
test
1xx
2xx
3xx</p>
      <p>CSR</p>
      <p>ASMOV</p>
      <p>RiMoM
For the first time in OAEI, an instance matching track was proposed to participants.
The aim of this track is to evaluate matchers on instance data coming from diverse
sources. Both data extracted from published Web datasets, and a testbed presenting
various automatically generated values and structure modifications were proposed.
The AKT-Rexa-DBLP (ARS) test case aims at testing the capability of the tools to
match individuals. All three datasets were structured using the same schema. The
challenges for the matchers included ambiguous labels (person names and paper titles) and
noisy data (some sources contained incorrect information).</p>
      <p>Ontology pair</p>
      <p>Prec. Rec. Ontology pair</p>
      <p>Prec. Rec.</p>
      <p>Test set The test case included three datasets from the domain of scientific publications:
– AKT EPrints archive14. This dataset contains information about papers produced
within the AKT research project.
– Rexa dataset15. This dataset was extracted from the Rexa search server, which was
constructed at the University of Massachusetts using automatic information
extraction algorithms.
– SWETO DBLP dataset16. This is a publicly available dataset listing publications
from the computer science domain.</p>
      <p>The SWETO-DBLP dataset was originally represented in RDF. Two other datasets
(AKT EPrints and Rexa) were extracted from the HTML sources using specially
constructed wrappers and structured according to the SWETO-DBLP ontology17. The
ontology describes information about scientific publications and their authors and extends
the commonly used FOAF ontology18. Authors are represented as individuals of the
14 http://eprints.aktors.org/
15 http://www.rexa.info/
16 http://lsdis.cs.uga.edu/projects/semdis/swetodblp/
17 http://lsdis.cs.uga.edu/projects/semdis/swetodblp/august2007/opus_august2007.rdf
18 http://xmlns.com/foaf/spec/
foaf:Person class, and a special class sweto:Publication is defined for publications, with
two subclasses sweto:Article and sweto:Article_in_Proceedings for journal and
conference publications respectively. The participants were invited to produce alignments for
each pair of datasets (AKT/Rexa, AKT/DBLP, and Rexa/DBLP).</p>
      <p>Evaluation results Five participants submitted results for the AKT-Rexa-DBLP test
case produced by their systems: DSSim, RiMOM, FBEM, HMatch, and ASMOV. The
results were evaluated by comparing them with a manually constructed reference
alignment and calculating the standard precision, recall, and F-measure. We measured the
performance of each system for the classes sweto:Publication and foaf:Person
separately, as well as for the combined set of individuals. These evaluation results are
provided in Table 25.</p>
      <p>sweto:Publication foaf:Person Overall
Prec. Rec. FMeas. Prec. Rec. FMeas. Prec. Rec. FMeas.</p>
      <p>The AKT/Rexa test scenario was the only one for which the results for ASMOV
were available and the only one for which all the systems provided alignments for
both foaf:Person and sweto:Publication classes. FBEM for the AKT/DBLP test case
only produced alignments for Publication instances, which reduced their overall recall.
For the class Publication the best F-measure in all three cases was achieved by
RiMOM with HMatch being the second. FBEM, which specifically focused on precision,
achieved the highest precision in all three cases at the expense of recall. It is interesting
to see the difference between systems in the Rexa/DBLP scenario where many distinct
individuals had identical titles, e.g., “Editorial.”, or “Minitrack Introduction.”. This
primarily affected the precision in the case of HMatch and RiMOM, but reduced recall for
FBEM.</p>
      <p>The performance of all systems was lower for the class Person where ambiguous
personal names and different label formats reduced the performance of string similarity
techniques. The highest F-measure was achieved by RiMOM and by HMatch for the
three test cases. Again, it is interesting to note the difference between RiMOM, HMatch,
and FBEM in the Rexa/DBLP case where the first two systems focused on F-measure
and the second one on precision. This distinction of approaches can be an important
criterion when a tool has to be selected for a real world use case: in some cases the
cost of an erroneous correspondence is much higher than the cost of a missed one,
e.g., the large-scale entity naming service such as FBEM, while in other scenarios this
might not be true, e.g., assisting the user who performs manual alignment of datasets.
In contrast, in the AKT/Rexa scenario the performance of FBEM was lower than the
performance of other systems both in terms of precision and recall. This was caused
by different label formats used by AKT and Rexa datasets (“FirstName LastName” vs
“LastName, FirstName”), which affected FBEM.</p>
      <p>Because in all three scenarios the datasets had more Person individuals than
Publication ones, the overall results were primarily influenced by the performance of the
tools on the class Person. Again, HMatch and RiMOM had the highest F-measure for
all the test cases. We can see a comparison with respect to F-measure in Figure 11.
1
0</p>
      <p>DSSim
RiMOM
FBEM
HMatch</p>
      <p>ASMOV</p>
      <p>AKT/Rexa AKT/DBLP Rexa/DBLP</p>
        <p>Fig. 11. Comparison on AKT-Rexa-DBLP with respect to F-measure</p>
      <sec id="sec-6-1">
        <title>9.2 ISLab Instance Matching Benchmark</title>
        <p>The ISLab Instance Matching Benchmark (IIMB) is a benchmark automatically
generated starting from one data source that is automatically modified according to various
criteria. The original data source contains OWL/RDF data about actors, sport persons,
and business firms provided by the OKKAM European project19. The benchmark is
composed by 37 test cases. For each test case we require participants to match the
original data source against a new data source. The original data source contains about 200
different instances. Each test case contains a modified version of the original data source
and the corresponding reference alignment containing the expected results.
Modifications introduced in IIMB are the following:
19 http://www.okkam.org
– Test case 001: Contains an identical copy of the original data source (instance IDs
are randomly changed).
– Test case 002 - Test case 010: Value transformations, i.e., typographical errors
simulation, use of different standard for representing the same information. In order to
simulate typographical errors, property values of each instance are randomly
modified. Modifications are applied on different subsets of the instances property values
and with different levels of difficulty, i.e., introducing a different number of errors.
– Test case 011 - Test case 019: Structural transformations, i.e., deletion of one or
more values, transformation of datatype properties into object properties, separation
of a single property into more properties.
– Test case 020 - Test case 029: Logical transformations, i.e., instantiation of identical
individuals into different subclasses of the same class, instantiation of identical
individuals into disjoint classes, instantiation of identical individuals into different
classes of an explicitly declared class hierarchy.</p>
        <p>– Test case 030 - Test case 037: Several combinations of the previous transformations.
Evaluation results. In this first edition of the instance matching track, six systems
participated in the IIMB task, namely AFlood, ASMOV, DSSim, HMatch, FBEM, and
RiMOM. In Table 26, we provide real precision and recall measures for the participating
systems.</p>
        <p>System</p>
        <p>Test</p>
        <p>AFlood ASMOV DSSim</p>
        <p>Prec. Rec. FMeas. Prec. Rec. FMeas. Prec. Rec. FMeas.
002 - 010 1.00 0.99
011 - 019 0.90 0.72
020 - 029 0.85 1.00
030 - 037 0.94 0.75
H-means 0.92 0.87
002 - 010 0.97 0.98
011 - 019 0.88 0.83
020 - 029 0.78 1.00
030 - 037 0.94 0.89
H-means 0.89 0.93</p>
        <p>A first general remark about the results is that three of the participating systems,
i.e., AFlood, ASMOV, and DSSim, provide better results in terms of precision rather
than in terms of recall, even if AFlood and ASMOV results can be considered very
good in both. On the other end, HMatch, FBEM, and RiMOM provide better results
in terms of recall, with better performances in case of HMatch and RiMOM. Coming
to the four categories of test cases, we can conclude that all the six systems show very
good performances on cases 002 - 010, where we just introduced some data errors by
maintaining both the data structure and the logical properties of data. On test cases 011
- 019, where data structures were changed by deleting or modifying property assertions,
AFlood, ASMOV, HMatch, and RiMOM still perform over 80% in terms of F-Measure,
while both DSSim and FBEM performances are lower, especially with respect to recall.
In general, test cases 011 - 019 were more difficult with respect to recall than to
precision. Test cases 020 - 029 were focused on logical transformations. In order to achieve
good performances here, it is important to take into account logical implications of the
schema over the instances. This is achieved by AFlood, ASMOV, DSSim, and RiMOM.
HMatch maintains high recall and good precision, while FBEM’s precision seems very
low. Finally, test cases 030 - 037 as well as the final harmonic mean show that AFlood,
ASMOV, HMatch, and RiMOM provide good results both in terms of precision and in
terms of recall. DSSim is more effective on precision, while FBEM is stronger in terms
of recall.</p>
        <p>1
0.9
0.8
0.7
n0.6
o
i
s
i
c
re0.5
P
0.4
Fig. 12. Precision/recall graphs. They cut the results given by the participants under a threshold
necessary for achieving n% recall and compute the corresponding precision.</p>
        <p>All the six systems provided their results with confidence measures. It is thus
possible to draw precision/recall graphs in order to compare them (see Figure 12). The graph
is computed by averaging the graphs of each of the tests. The precision/recall graph
confirms the comparison done over real precision and recall values, especially in case
of recall values lower than 50%. After that threshold, ASMOV, RiMOM, and HMatch
maintain their performances high, and FBEM performances are stable. Instead, DSSim
and AFlood values of precision decrease quite quickly, even if AFlood performances
are still better than those of FBEM and DSSim.
10</p>
      </sec>
    </sec>
    <sec id="sec-7">
      <title>Very Large Crosslingual Resources</title>
      <p>The goal of the Very Large Crosslingual Resources challenge is twofold. First, we are
interested in matching vocabularies in different languages. Many collections throughout
Europe are indexed with vocabularies in languages other than English. These collections
would benefit from an alignment to resources in other languages to broaden the user
group, and possibly enable integrated access to the different collections. Second, we
intend to present a realistic use case in the sense that the resources are large, rich in
semantics but weak in formal structure, i.e. realistic on the Web. For collections indexed
with an in-house vocabulary, the link to a widely-used and rich resource can enhance
the structure and increase the scope of the in-house thesaurus. In this task, we aim for
skos:exactMatch and skos:closeMatch relations.
Three resources are used in this task:
WordNet WordNet is a lexical database of the English language developed at Princeton
University20. Its main building blocks are synsets: groups of words with a
synonymous meaning. In this task, the goal is to match noun-synsets. WordNet contains 7
types of relations between noun-synsets, but the main hierarchy in WordNet is built
on hyponym relations, which are similar to subclass relations. W3C has translated
WordNet version 2.0 into RDF/OWL.</p>
      <p>The original WordNet model is a rich and well-designed model. However, some
tools may have problems with the fact that the synsets are instances rather than
classes. Therefore, for the purpose of this OAEI task, we have translated the
hyponym hierarchy in a skos:broader hierarchy, making the synsets skos:Concepts.
DBpedia DBpedia contains 2.18 million resources or “things”, each tied to an article in
the English language Wikipedia. The “things” are described by titles and abstracts
in English and often also in Dutch. DBpedia “things” have numerous properties,
such as categories, properties derived from the wikipedia “infoboxes”, links
between pages within and outside wikipedia, etc.</p>
      <p>GTAA The GTAA is a Dutch thesaurus used by the Netherlands Institute for Sound
and Vision to index their collection of TV programs. It is a facetted thesaurus, of
which we use the following four facets: (1) Subject: the topic of a TV program,
20 http://wordnet.princeton.edu/</p>
      <p>3800 terms; (2) People: the main people mentioned in a TV program, 97.000
terms; Names: the main “Named Entities” mentioned in a TV program
(Corporation names, music bands, etc.), 27.000 terms; Location: the main locations
mentioned in a TV program or the place where it has been created, 14.000 terms.
The purpose of this task is to match GTAA concepts to DBpedia “things” and WordNet
synsets.
We evaluate the results of the two alignments (GTAA-WordNet, GTAA-DBpedia) in
terms of precision and recall. Aside from an overall measure, we also present measures
for each GTAA facet separately. We introduce an evaluation on a 3-point scale of 0 -
0.5 - 1. We assign 1 point when the relation between two concepts is correctly identified
as a skos:exactMatch or a skos:closeMatch. We assign 0.5 points if the proposed relation
is skos:exactMatch while we consider the relation to be skos:closeMatch, or vice versa.
Correspondences between concepts that are not related get 0 points. The scores are used
to generate generalized precision and recall figures.</p>
      <p>Precision For each participant, we take samples of between 71 and 97 correspondences
per GTAA facet for both the GTAA-DBpedia and the GTAA-WordNet alignments and
evaluate their correctness in terms of exact match, close match, or no match.
Recall Due to time constraints, we only determined recall of the GTAA Subject facet.
We use a small reference alignment from a random sample of 100 GTAA concepts,
which we manually mapped to WordNet and DBpedia for the VLCR evaluation of
2008. The result of the GTAA-WordNet and GTAA-DBpedia alignments are compared
to the reference alignments.</p>
      <p>Inter-rater agreement A team of 4 raters rated random samples of DSSim’s
correspondences. A team of 3 raters rated the GG2WW correspondences, where each
alignment was divided over two raters. One rater was a member of both teams.</p>
      <p>In order to check the inter-rater agreement, 100 correspondences were rated by two
raters. The agreement was high with a Cohen’s kappa of 0.87. In addition, we compared
this year’s evaluation samples with those of 2008. 120 correspondences appeared in
both sets, and again the agreement between the scores was high; Cohen’s kappa was
0.92.
10.3
Two teams participated in the OAEI VLCR task: DSSim and GG2WW. Table 27 shows
the number of concepts in each resource and the number of correspondences returned
for each resource pair. Both participants produced only exact matches. After consulting
the participants, we have considered using the confidence measures as an indication of
the strength of the mapping: a mapping with a confidence measure of 1 was seen as an
exact match and a mapping with a confidence measure &lt; 1 was seen as a close match.
However, this idea led to lower precision values for both participants and was therefore
abandoned. All correspondences in Table 27 are considered to be exact matches.</p>
      <p>GTAA facet #concepts #corresp. DSSim #corresp. GG2WW</p>
      <p>to WN to DBp to WN to DBp
Subject
People
Names
Locations
Total</p>
      <p>Regarding precision, GG2WW scores consistently better than DSSim on the
GTAADBpedia alignments. Both systems show a similar pattern when comparing the scores
of the four GTAA facets: the scores of the Location facet are highest, followed by the
Person, Subject and finally the Name facet. DSSim scores best on the GTAA-WordNet
alignments, although a comparison is limited since GG2WW only returned
correspondences to the GTAA Subject facet.</p>
      <p>DSSim has participated in the VLCR task of 2008 as well. However, a direct
comparison of the precision scores of 2008 and 2009 is difficult due to differences in the
task; in 2008 we considered SKOS exact-, broad-, narrow- and related-matches. The
results of 2008 and 2009 do show similarities when comparing the scores of the facets
and resources. The GTAA Names facet remains hard to match, which might be due to
the many Dutch-specific concepts in this facet, such as Dutch ships named after famous
people. WordNet appears again to be less compatible with the GTAA facets, with the
exception of the Subject facet.</p>
      <p>Recall measures can be compared to last year directly, as we have used the same
evaluation measures and reference alignment. DSSim scores exactly the same on the
GTAA-WordNet mapping (0.19) and higher on the GTAA-DBpedia mapping (from
0.22 to 0.30). GG2WW produced 50% more correspondences between GTAA-WordNet
and 300% more correspondences between GTAA-DBpedia than DSSIM (Table 27).
This translates to a recall score that is 3 and 2 times as high as the DSSim scores.
11
This year we performed analyses of the extent to which particular alignments preserved
the structure between two ontologies, or more specifically, between two class
hierarchies [15; 5]. Here we provide a brief summary of the approach and presentation of the
results.</p>
      <p>We wish to measure the smoothness of such an alignment, while recognizing that
being a smooth mapping is neither necessary nor sufficient to be a good mapping.
Nonetheless a strong correlation of smoothness with precision, recall or F-measure
promises a potentially automatic predictor of alignment quality independent of a
reference alignment. Additionally, knowledge of the structural properties of alignments is
useful for ontology matchers, especially when providing alignments within one domain
where structural preservation is desired.</p>
      <p>
        An alignment is modeled as a relation between two semantic hierarchies, modeled
as partially ordered sets [
        <xref ref-type="bibr" rid="ref6">6</xref>
        ]. Such ordered structures are not, in general, trees, nor even
lattices, but can be rich in multiple inheritance and lack unique least common subsumers
between nodes.
      </p>
      <p>Let a semantic hierarchy be a bounded partially ordered set (poset) $\mathcal{P} = \langle P, \leq \rangle$,
where $P$ is a finite set of ontology nodes, and ${\leq} \subseteq P^2$ is a reflexive, anti-symmetric,
and transitive binary relation such as subsumption (“is-a”). For two taxonomies $\mathcal{P} =
\langle P, \leq \rangle$, $\mathcal{P}' = \langle P', \leq' \rangle$, an alignment relation $F \subseteq P \times P'$ is a collection of pairs
$f = \langle a, a' \rangle \in F$, indicating that the node $a \in P$ on the “left” side is mapped or aligned
to the node $a' \in P'$ on the “right” side. $F$ determines a domain $Q \subseteq P$ and codomain $Q' \subseteq P'$:</p>
      <p>$Q' = \{a' \in P' : \exists a \in P, \langle a, a' \rangle \in F\}$,
$Q = \{a \in P : \exists a' \in P', \langle a, a' \rangle \in F\}$.</p>
      <p>We call the $f \in F$ links, the $a \in Q$ the left anchors and the $a' \in Q'$ the right
anchors. Let $m = |Q|$, $m' = |Q'|$, and $N = |F| \leq mm'$.</p>
      <p>Our approach is not a relative measure of an alignment with respect to a reference
alignment, but rather an inherent or independent measure of the alignment based on the
following principles:
Twist, or order discrepancy: $a, b$ should have the same structural relations in $\mathcal{P}$ as
$a', b'$ in $\mathcal{P}'$;
Stretch, or distance discrepancy: the relative distance between $a, b \in P$ should be the
same as between $a', b' \in P'$.</p>
      <p>Let $d$ be a metric on $\mathcal{P}$ and $\mathcal{P}'$. For links $f = \langle a, a' \rangle$, $g = \langle b, b' \rangle \in F$, we want the
metric relations between the $a, b \in Q$ to be the same as their corresponding $a', b' \in Q'$,
so that $|d(a, b) - d'(a', b')|$ is small. In this work, we use the upper and lower
cardinality-based distances:
$d_u(a, b) = |{\uparrow}a| + |{\uparrow}b| - 2\max_{c \in a \vee b} |{\uparrow}c|$,
$d_l(a, b) = |{\downarrow}a| + |{\downarrow}b| - 2\max_{c \in a \wedge b} |{\downarrow}c|$,</p>
      <p>where for a node $a \in P$, its upset ${\uparrow}a = \{x \mid x \geq a\}$ and downset ${\downarrow}a = \{x \mid x \leq a\}$
are all its ancestors and successors respectively, so that $|{\uparrow}a|, |{\downarrow}a|$ are the number of
ancestors and successors. The generalized join and meet are
$a \vee b = \operatorname{Min}({\uparrow}a \cap {\uparrow}b)$,
$a \wedge b = \operatorname{Max}({\downarrow}a \cap {\downarrow}b)$,
where for a set of nodes $R \subseteq P$ the upper bounds and lower bounds are
$\operatorname{Min}(R) = \{a \in R : \nexists b \in R, b &lt; a\}$,
$\operatorname{Max}(R) = \{a \in R : \nexists b \in R, b &gt; a\}$.</p>
      <p>We need to measure the relative proportion of the overall structure two nodes are
apart, so define the normalized upper and lower distances as:
$\hat{d}_u(a, b) = d_u(a, b) / |P|$,
$\hat{d}_l(a, b) = d_l(a, b) / |P|$.
The resulting distance discrepancy satisfies $D(F) \in [0, 1]$, with $D(F) = 0$ iff $F$ is
completely distance preserving, and $D = 1$ if $F$
is maximally distance distorting, e.g. mapping diameters to equality, and neighbors and
children to diameters. We also calculate the order discrepancy of each alignment as:
$\Delta(F) = \frac{\sum_{f, g \in F} \delta(f, g)}{\binom{N}{2}}$,
where, for ${\sim} \in \{&lt;, &gt;, =, \parallel\}$ ($\parallel$ denoting non-comparability),
$\delta(f, g) = \begin{cases} 0, &amp; \text{if } a \sim b \text{ and } a' \sim' b' \\ 1, &amp; \text{otherwise} \end{cases}$.</p>
      <p>Hence $D(F)$ measures the “stretching” of $F$, while $\Delta(F)$ measures “twisting”, or the
number of purely structural violations present.</p>
      <p>Figure 13, Figure 14, and Figure 15 show scatter plots of D(F ) against precision for
all the 1xx, 2xx, and 3xx tests for the benchmark track, respectively. We see a moderate
trend of decreasing precision with increasing D(F ), with Pearson correlation
coefficients of r = 0.65 and r = 0.51 respectively. Table 29 shows the correlation r
for D(F ) and (F ) against precision, recall, and F-measure for all tracks, and all 1xx,
2xx, and 3xx tracks grouped together.</p>
      <p>For more details on a particular track, Table 30 shows the results from Test 205
from Benchmark. We can see in this case a particular strong dropoff in precision with
increasing discrepancy, with r = 0.92.</p>
      <p>Table 31 shows the results for the anatomy track. Scatter plots are shown in
Figure 16 for all tests. Table 32 summarizes the correlations, combining all tests, and then</p>
      <p>Fig. 15. Precision vs. D(F ), Benchmark track: 3xx tests.</p>
      <p>r</p>
      <p>D(F ) Prec. Rec. FMeas.
broken out by test. Again, we see a strong correlation of increasing D(F ) against
especially decreasing precision. Note the outlier point, corresponding to Taxomap in test 3
with D(F ) = 0.00145. If this point is excluded, then among all tests we obtain r values
of 0.84 for precision, 0.05 for recall, and 0.61 for F-measure.</p>
      <p>
        These preliminary results are clearly in need of further analysis, which we are now
embarking on. Some early comments include:
– These results are consistent with those shown in [
        <xref ref-type="bibr" rid="ref5">5</xref>
        ], which showed a moderate
correlation of D(F ) with F-measure.
– Pearson correlation, the only measure here, is a weak indicator, but suggestive that
our lower distance discrepancy may act as a predictor of precision.
– Here only the lower distance dl(a; b) and distance discrepancy D(F ) were used.
      </p>
      <p>Further consideration is also required of the role of the upper distance $d_u(a, b)$ and
the order discrepancy $\Delta(F)$.</p>
      <p>D(F )</p>
      <p>(F ) Prec. Rec. FMeas.</p>
      <p>r
The lessons learned for this year are relatively similar to those of previous years. There
remains one lesson not really taken into account that we identify with an asterisk (*). We
reiterate those lessons that still apply, along with new ones:
A) Unfortunately, we have not been able to maintain the better schedule of two years
ago. We hope to be able to improve this through the use of SEALS technology (see
§13).</p>
      <p>B) The trend that there are more matching systems able to enter such an evaluation
seems to slow down. There have not been many new systems this year, except on
specialised topics. There can be two explanations: the field is shrinking or the entry
ticket is too high.</p>
      <p>C) We still can confirm that systems that enter the campaign for several times tend to
improve over years.
*D) The benchmark test case is not discriminant enough between systems and, as noted
last year, automatic test generation could contribute to improve the situation. We
plan to introduce this in the SEALS platform.</p>
      <p>E) Some tracks provide non conclusive results, we should make effort to improve this
situation by knowing, beforehand, what conclusions can be drawn from the
evaluations.</p>
      <p>F) With the increase in the number of data sets come fewer participants. We will have
to set rules for declaring unfruitful those tracks in which there is no minimal independent
participation.</p>
      <p>Of course, these are only suggestions that will be refined during the coming year, see
[22] for a detailed discussion on the ontology matching challenges.
13</p>
    </sec>
    <sec id="sec-8">
      <title>Future plans</title>
      <p>In order to improve the organization of the Ontology Alignment Evaluation Initiative,
plans are made for next year that the evaluation campaign be run on a new open platform
for semantic technology evaluation developed by the SEALS project21. The SEALS
project aims at providing support for the evaluation of semantic technologies, including
ontology matching.</p>
      <p>The project will provide an automated test infrastructure and will organize
integrated evaluation campaigns. This will allow new features in tests cases like test
generation on demand and online evaluation. This will lead to a more automated and
integrated way to evaluate systems as well as the opportunity for participants to run the
evaluation for themselves.</p>
      <p>We plan to run the next OAEI campaign within this framework and to have at least
three tracks, and if possible more, fully supported by the SEALS platform.
21 http://www.seals-project.eu
Confirming the trend of last year, the number of systems, and tracks they enter in, seems
to stabilize. As noticed the previous years, systems which do not enter for the first
time are those which perform better. This shows that, as expected, the field of ontology
matching is getting stronger (and we hope that evaluation has been contributing to this
progress).</p>
      <p>Moreover, we had this year more tracks but participants did not enter more tracks
than previous years: 3.25 against 3.84 in 2008 and 2.94 in 2007. This figure of around
3 out of 8 may be the result of either the specialization of systems or the short time
allowed to the campaign.</p>
      <p>All participants have provided a description of their systems and their experience in
the evaluation. These OAEI papers, like the present one, have not been peer reviewed.
However, they are full contributions to this evaluation exercise and reflect the hard work
and clever insight people put in the development of participating systems. Reading the
papers of the participants should help people involved in ontology matching to find what
makes these algorithms work and what could be improved. Sometimes participants offer
alternate evaluation results.</p>
      <p>The Ontology Alignment Evaluation Initiative will continue these tests by
improving both test cases and testing methodology for being more accurate. Further
information can be found at:</p>
      <sec id="sec-8-1">
        <title>Acknowledgments</title>
        <p>We warmly thank each participant of this campaign. We know that they have worked
hard for having their results ready and they provided insightful papers presenting their
experience. The best way to learn about the results remains to read the following papers.</p>
        <p>We thank Paolo Bouquet and the OKKAM European Project for providing the
reference alignment for the IIMB benchmark used in the instance matching track.</p>
        <p>We thank Patrice Landry, Genevieve Clavel and Jeroen Hoppenbrouwers for the
MACS data. For LCSH, RAMEAU and SWD, respectively, The Library of Congress,
The French National Library and the German National Library. The collection of the
British Library was provided by the The European Library Office.</p>
        <p>Jérôme Euzenat, Christian Meilicke, Heiner Stuckenschmidt and Cassia Trojahn
dos Santos have been partially supported by the SEALS (IST-2009-238975) European
project.</p>
        <p>We are grateful to Dominique Ritze (University of Mannheim) for participating in
extension of reference alignment for the conference track. In addition, Ondřej
Šváb-Zamazal and Vojtěch Svátek were supported by the IGA VSE grant no. 20/08
“Evaluation and matching ontologies via patterns”.</p>
        <p>We also warmly thank Claudio Baldassarre for preparing unfruitful test cases
which were cancelled; we hope to have more success with these in the coming years.</p>
        <p>We are grateful to Martin Ringwald and Terry Hayamizu for providing the reference
alignment for the anatomy ontologies.</p>
        <p>We gratefully acknowledge the Dutch Institute for Sound and Vision for allowing
us to use the GTAA. We would like to thank Willem van Hage for the use of his tools
for manual evaluation of correspondences.</p>
        <p>We also thank the other members of the Ontology Alignment Evaluation
Initiative Steering committee: Wayne Bethea (Johns Hopkins University, USA), Lewis Hart
(AT&amp;T, USA), Tadashi Hoshiai (Fujitsu, Japan), Todd Hughes (DARPA, USA),
Yannis Kalfoglou (Ricoh laboratories, UK), John Li (Teknowledge, USA), Miklos Nagy
(The Open University, UK), Natasha Noy (Stanford University, USA), Yuzhong Qu
(Southeast University, China), York Sure (Leibniz Gemeinschaft, Germany), Jie Tang
(Tsinghua University, China), Raphaël Troncy (Eurecom, France), and Petko Valtchev
(Université du Québec Montréal, Canada).
12. Daniel Fleischhacker and Heiner Stuckenschmidt. Implementing semantic precision and
recall. In Proc. 4th International Workshop on Ontology Matching (OM-2009), collocated
with ISWC-2009, Chantilly (USA), 2009. this volume.
13. Fausto Giunchiglia, Mikalai Yatskevich, Paolo Avesani, and Pavel Shvaiko. A large scale
dataset for the evaluation of ontology matching systems. The Knowledge Engineering Review
Journal, 24(2):137–157, 2009.
14. Antoine Isaac, Henk Matthezing, Lourens van der Meij, Stefan Schlobach, Shenghui Wang,
and Claus Zinn. Putting ontology alignment in context: Usage scenarios, deployment and
evaluation in a library case. In Proceedings of the 5th European Semantic Web Conference
(ESWC), pages 402–417, Tenerife (ES), 2008.
15. Cliff Joslyn, Alex Donaldson, and Patrick Paulson. Evaluating the structural quality of
semantic hierarchy alignments. In International Semantic Web Conference (Posters &amp; Demos),
Karlsruhe (Germany), 2008.
16. Patrick Lambrix and Qiang Liu. Using partial reference alignments to align ontologies.</p>
        <p>In Proceedings of the 6th European Semantic Web Conference, pages 188–202, Heraklion,
Crete (Greece), 2009.
17. Christian Meilicke and Heiner Stuckenschmidt. Incoherence as a basis for measuring the
quality of ontology mappings. In Proc. 3rd International Workshop on Ontology Matching
(OM-2008), collocated with ISWC-2008, pages 1–12, Karlsruhe (Germany), 2008.
18. Christian Meilicke and Heiner Stuckenschmidt. An efficient method for computing a local
optimal alignment diagnosis. Technical report, University Mannheim, Computer Science
Institute, 2009.
19. Ondřej Šváb-Zamazal and Vojtěch Svátek. Empirical knowledge discovery over ontology
matching results. In Proc. 1st ESWC International Workshop on Inductive Reasoning and
Machine Learning on the Semantic Web, Heraklion (Greece), 2009.
20. Marta Sabou, Mathieu d’Aquin, and Enrico Motta. Using the semantic web as background
knowledge for ontology mapping. In Proc. 1st International Workshop on Ontology
Matching (OM-2006), collocated ISWC-2006, pages 1–12, Athens, Georgia (USA), 2006.
21. Francois Scharffe. Correspondence Patterns Representation. PhD thesis, University of
Innsbruck, 2009.
22. Pavel Shvaiko and Jérôme Euzenat. Ten challenges for ontology matching. In Proceedings of
the 7th International Conference on Ontologies, DataBases, and Applications of Semantics
(ODBASE), pages 1164–1182, Monterrey (MX), 2008.
23. York Sure, Oscar Corcho, Jérôme Euzenat, and Todd Hughes, editors. Proceedings of the</p>
        <p>ISWC Workshop on Evaluation of Ontology-based Tools (EON), Hiroshima (JP), 2004.
24. Willem Robert van Hage, Antoine Isaac, and Zharko Aleksovski. Sample evaluation of
ontology-matching systems. In Proc. 5th International Workshop on Evaluation of
Ontologies and Ontology-based Tools (EON 2007), collocated with ISWC-2007, pages 41–50,
Busan (Korea), 2007.</p>
      </sec>
    </sec>
  </body>
  <back>
    <ref-list>
      <ref id="ref1">
        <mixed-citation>
          1.
          <string-name>
            <given-names>Zharko</given-names>
            <surname>Aleksovski</surname>
          </string-name>
          , Warner ten Kate, and Frank van Harmelen.
          <article-title>Exploiting the structure of background knowledge used in ontology matching</article-title>
          .
          <source>In Proc. 1st International Workshop on Ontology Matching (OM-2006), collocated with ISWC-2006</source>
          , Athens, Georgia (USA),
          <year>2006</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref2">
        <mixed-citation>
          2.
          <string-name>
            <given-names>Ben</given-names>
            <surname>Ashpole</surname>
          </string-name>
          , Marc Ehrig, Jérôme Euzenat, and Heiner Stuckenschmidt, editors.
          <source>Proceedings of the K-Cap Workshop on Integrating Ontologies</source>
          , Banff (CA),
          <year>2005</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref3">
        <mixed-citation>
          3.
          <string-name>
            <given-names>Oliver</given-names>
            <surname>Bodenreider</surname>
          </string-name>
          , Terry Hayamizu, Martin Ringwald, Sherri De Coronado, and Songmao Zhang.
          <article-title>Of mice and men: Aligning mouse and human anatomies</article-title>
          .
          <source>In Proc. American</source>
          Medical Informatics
          <string-name>
            <surname>Association (AIMA) Annual</surname>
            <given-names>Symposium</given-names>
          </string-name>
          , pages
          <fpage>61</fpage>
          -
          <lpage>65</lpage>
          ,
          <year>2005</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref4">
        <mixed-citation>
          4.
          <string-name>
            <given-names>Caterina</given-names>
            <surname>Caracciolo</surname>
          </string-name>
          , Jérôme Euzenat, Laura Hollink, Ryutaro Ichise, Antoine Isaac, Véronique Malaisé, Christian Meilicke, Juan Pane, Pavel Shvaiko, Heiner Stuckenschmidt,
          <article-title>Ondrej Sváb-Zamazal, and Vojtech Svátek. Results of the ontology alignment evaluation initiative 2008</article-title>
          .
          <source>In Proc. 3rd International Workshop on Ontology Matching (OM-2008)</source>
          <article-title>, collocated with ISWC-2008</article-title>
          , Karlsruhe (Germany),
          <year>2008</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref5">
        <mixed-citation>
          5.
          <string-name>
            <given-names>Cliff</given-names>
            <surname>Joslyn</surname>
          </string-name>
          , Patrick Paulson,
          <string-name>
            <given-names>and Amanda</given-names>
            <surname>White</surname>
          </string-name>
          .
          <article-title>Measuring the structural preservation of semantic hierarchy alignments</article-title>
          .
          <source>In Proc. 4th International Workshop on Ontology Matching (OM-2009)</source>
          <article-title>, collocated with ISWC-2009, Chantilly</article-title>
          (USA),
          <year>2009</year>
          . this volume.
        </mixed-citation>
      </ref>
      <ref id="ref6">
        <mixed-citation>
          6.
          <string-name>
            <given-names>Brian</given-names>
            <surname>Davey</surname>
          </string-name>
          and
          <string-name>
            <given-names>Hilary</given-names>
            <surname>Priestley</surname>
          </string-name>
          .
          <article-title>Introduction to lattices and order</article-title>
          . Cambridge University Press, Cambridge, 2nd edition,
          <year>1990</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref7">
        <mixed-citation>
          7.
          <string-name>
            <given-names>Marc</given-names>
            <surname>Ehrig</surname>
          </string-name>
          and
          <string-name>
            <given-names>Jérôme</given-names>
            <surname>Euzenat</surname>
          </string-name>
          .
          <article-title>Relaxed precision and recall for ontology matching</article-title>
          .
          <source>In Proceedings of the K-Cap Workshop on Integrating Ontologies</source>
          , pages
          <fpage>25</fpage>
          -
          <lpage>32</lpage>
          , Banff (CA),
          <year>2005</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref8">
        <mixed-citation>
          8.
          <string-name>
            <given-names>Jérôme</given-names>
            <surname>Euzenat</surname>
          </string-name>
          .
          <article-title>An API for ontology alignment</article-title>
          .
          <source>In Proceedings of the 3rd International Semantic Web Conference (ISWC)</source>
          , pages
          <fpage>698</fpage>
          -
          <lpage>712</lpage>
          , Hiroshima (JP),
          <year>2004</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref9">
        <mixed-citation>
          9.
          <string-name>
            <given-names>Jérôme</given-names>
            <surname>Euzenat</surname>
          </string-name>
          , Malgorzata Mochol, Pavel Shvaiko, Heiner Stuckenschmidt, Ondrej Svab, Vojtech Svatek, Willem Robert van Hage,
          <string-name>
            <given-names>and Mikalai</given-names>
            <surname>Yatskevich</surname>
          </string-name>
          .
          <article-title>Results of the ontology alignment evaluation initiative 2006</article-title>
          .
          <source>In Proc. 1st International Workshop on Ontology Matching (OM-2006), collocated with ISWC-2006</source>
          , pages
          <fpage>73</fpage>
          -
          <lpage>95</lpage>
          , Athens, Georgia (USA),
          <year>2006</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref10">
        <mixed-citation>
          10.
          <string-name>
            <given-names>Jérôme</given-names>
            <surname>Euzenat</surname>
          </string-name>
          and
          <string-name>
            <given-names>Pavel</given-names>
            <surname>Shvaiko</surname>
          </string-name>
          .
          <source>Ontology Matching</source>
          . Springer, Heidelberg (DE),
          <year>2007</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref11">
        <mixed-citation>
          11. Jérôme Euzenat, Antoine Isaac, Christian Meilicke, Pavel Shvaiko, Heiner Stuckenschmidt, Ondrej Svab, Vojtech Svatek, Willem Robert van Hage,
          <string-name>
            <given-names>and Mikalai</given-names>
            <surname>Yatskevich</surname>
          </string-name>
          .
          <article-title>Results of the ontology alignment evaluation initiative 2007</article-title>
          .
          <source>In Proc. 2nd International Workshop on Ontology Matching (OM-2007), collocated with ISWC-2007</source>
          , pages
          <fpage>96</fpage>
          -
          <lpage>132</lpage>
          ,
          <string-name>
            <surname>Busan</surname>
          </string-name>
          (Korea),
          <year>2007</year>
          .
        </mixed-citation>
      </ref>
    </ref-list>
  </back>
</article>