<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta />
    <article-meta>
      <title-group>
        <article-title>Results of the Ontology Alignment Evaluation Initiative 2017</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author">
          <string-name>Manel Achichi</string-name>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Michelle Cheatham</string-name>
          <email>michelle.cheatham@wright.edu</email>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Zlatan Dragisic</string-name>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Jérôme Euzenat</string-name>
          <email>Jerome.Euzenat@inria.fr</email>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Daniel Faria</string-name>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Alfio Ferrara</string-name>
          <xref ref-type="aff" rid="aff12">12</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Giorgos Flouris</string-name>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Irini Fundulaki</string-name>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Ian Harrow</string-name>
          <xref ref-type="aff" rid="aff10">10</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Valentina Ivanova</string-name>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Ernesto Jiménez-Ruiz</string-name>
          <email>ernestoj@ifi.uio.no</email>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Kristian Kolthoff</string-name>
          <xref ref-type="aff" rid="aff14">14</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Elena Kuss</string-name>
          <xref ref-type="aff" rid="aff14">14</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Patrick Lambrix</string-name>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Henrik Leopold</string-name>
          <email>h.leopold@vu.nl</email>
          <xref ref-type="aff" rid="aff15">15</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Huanyu Li</string-name>
          <email>huanyu.li@liu.se</email>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Christian Meilicke</string-name>
          <xref ref-type="aff" rid="aff14">14</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Majid Mohammadi</string-name>
          <email>m.mohammadi@tudelft.nl</email>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Stefano Montanelli</string-name>
          <email>stefano.montanelli@unimi.it</email>
          <xref ref-type="aff" rid="aff12">12</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Catia Pesquita</string-name>
          <email>cpesquita@di.fc.ul.pt</email>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Tzanina Saveta</string-name>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Pavel Shvaiko</string-name>
          <email>pavel.shvaiko@infotn.it</email>
          <xref ref-type="aff" rid="aff11">11</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Andrea Splendiani</string-name>
          <xref ref-type="aff" rid="aff10">10</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Heiner Stuckenschmidt</string-name>
          <email>heiner@informatik.uni-mannheim.de</email>
          <xref ref-type="aff" rid="aff14">14</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Elodie Thiéblin</string-name>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Konstantin Todorov</string-name>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Cássia Trojahn</string-name>
          <email>cassia.trojahn@irit.fr</email>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Ondřej Zamazal</string-name>
          <email>ondrej.zamazal@vse.cz</email>
          <xref ref-type="aff" rid="aff13">13</xref>
        </contrib>
        <aff id="aff0">
          <label>0</label>
          <institution>Data Semantics (DaSe) Laboratory, Wright State University</institution>
          ,
          <country country="US">USA</country>
        </aff>
        <aff id="aff1">
          <label>1</label>
          <institution>Department of Informatics, University of Oslo</institution>
          ,
          <country country="NO">Norway</country>
        </aff>
        <aff id="aff2">
          <label>2</label>
          <institution>Faculty of Technology, Policy, and Management</institution>
          ,
          <institution>Technical University of Delft</institution>
          ,
          <country country="NL">Netherlands</country>
        </aff>
        <aff id="aff3">
          <label>3</label>
          <institution>INRIA &amp; Univ. Grenoble Alpes</institution>
          ,
          <addr-line>Grenoble</addr-line>
          ,
          <country country="FR">France</country>
        </aff>
        <aff id="aff4">
          <label>4</label>
          <institution>IRIT &amp; Université Toulouse II</institution>
          ,
          <addr-line>Toulouse</addr-line>
          ,
          <country country="FR">France</country>
        </aff>
        <aff id="aff5">
          <label>5</label>
          <institution>Institute of Computer Science-FORTH</institution>
          ,
          <addr-line>Heraklion</addr-line>
          ,
          <country country="GR">Greece</country>
        </aff>
        <aff id="aff6">
          <label>6</label>
          <institution>Instituto Gulbenkian de Ciência</institution>
          ,
          <addr-line>Lisbon</addr-line>
          ,
          <country country="PT">Portugal</country>
        </aff>
        <aff id="aff7">
          <label>7</label>
          <institution>LASIGE, Faculdade de Ciências, Universidade de Lisboa</institution>
          ,
          <country country="PT">Portugal</country>
        </aff>
        <aff id="aff8">
          <label>8</label>
          <institution>LIRMM/University of Montpellier</institution>
          ,
          <country country="FR">France</country>
        </aff>
        <aff id="aff9">
          <label>9</label>
          <institution>Linköping University &amp; Swedish e-Science Research Center</institution>
          ,
          <addr-line>Linköping</addr-line>
          ,
          <country country="SE">Sweden</country>
        </aff>
        <aff id="aff10">
          <label>10</label>
          <institution>Pistoia Alliance Inc.</institution>
          ,
          <country country="US">USA</country>
        </aff>
        <aff id="aff11">
          <label>11</label>
          <institution>TasLab</institution>
          ,
          <addr-line>Informatica Trentina, Trento</addr-line>
          ,
          <country country="IT">Italy</country>
        </aff>
        <aff id="aff12">
          <label>12</label>
          <institution>Università degli studi di Milano</institution>
          ,
          <country country="IT">Italy</country>
        </aff>
        <aff id="aff13">
          <label>13</label>
          <institution>University of Economics</institution>
          ,
          <addr-line>Prague</addr-line>
          ,
          <country country="CZ">Czech Republic</country>
        </aff>
        <aff id="aff14">
          <label>14</label>
          <institution>University of Mannheim</institution>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff15">
          <label>15</label>
          <institution>Vrije Universiteit Amsterdam</institution>
          ,
          <country country="NL">Netherlands</country>
        </aff>
      </contrib-group>
      <pub-date>
        <year>2017</year>
      </pub-date>
      <abstract>
        <p>Ontology matching consists of finding correspondences between semantically related entities of different ontologies. Note that the only official results of the campaign are on the OAEI web site.</p>
      </abstract>
    </article-meta>
  </front>
  <body>
    <sec id="sec-1">
      <title>Abstract</title>
      <p>The Ontology Alignment Evaluation Initiative (OAEI) aims at comparing
ontology matching systems on precisely defined test cases. These test cases can be
based on ontologies of different levels of complexity (from simple thesauri to
expressive OWL ontologies) and use different evaluation modalities (e.g., blind
evaluation, open evaluation, or consensus). The OAEI 2017 campaign offered 9
tracks with 23 test cases, and was attended by 21 participants. This paper is an
overall presentation of that campaign.</p>
    </sec>
    <sec id="sec-2">
      <title>1 Introduction</title>
      <p>The Ontology Alignment Evaluation Initiative<sup>1</sup> (OAEI) is a coordinated international
initiative, which organizes the evaluation of an increasing number of ontology matching
systems [20, 22]. The main goal of the OAEI is to compare systems and algorithms
openly and on the same basis, in order to allow anyone to draw conclusions about the
best matching strategies. Furthermore, our ambition is that, from such evaluations, tool
developers can improve their systems.</p>
      <p>The first two events were organized in 2004: (i) the Information Interpretation and
Integration Conference (I3CON) held at the NIST Performance Metrics for Intelligent
Systems (PerMIS) workshop and (ii) the Ontology Alignment Contest held at the
Evaluation of Ontology-based Tools (EON) workshop of the annual International Semantic
Web Conference (ISWC) [46]. Then, a unique OAEI campaign occurred in 2005 at the
workshop on Integrating Ontologies held in conjunction with the International
Conference on Knowledge Capture (K-Cap) [5]. From 2006 until the present, the OAEI
campaigns were held at the Ontology Matching workshop, collocated with ISWC [2, 3,
7–9, 13, 16–19, 21], which this year took place in Vienna, Austria<sup>2</sup>.</p>
      <p>Since 2011, we have been using an environment for automatically processing
evaluations (§2.2) which was developed within the SEALS (Semantic Evaluation At Large
Scale) project<sup>3</sup>. SEALS provided a software infrastructure for automatically executing
evaluations and evaluation campaigns for typical semantic web tools, including
ontology matching. In the OAEI 2017, a novel evaluation environment called HOBBIT (§10)
was adopted for the novel HOBBIT Link Discovery track. In all other tracks, systems
were executed under the SEALS client. The Benchmark track
was discontinued in this edition of the OAEI.</p>
      <p>This paper synthesizes the 2017 evaluation campaign and introduces the results
provided in the papers of the participants. The remainder of the paper is organised as
follows: in Section 2, we present the overall evaluation methodology that has been used;
Sections 3-11 discuss the settings and the results of each of the test cases; Section 13
overviews lessons learned from the campaign; and finally, Section 14 concludes the
paper.</p>
      <sec id="sec-2-1">
        <title>1 http://oaei.ontologymatching.org 2 http://om2017.ontologymatching.org 3 http://www.seals-project.eu</title>
      </sec>
    </sec>
    <sec id="sec-3">
      <title>General methodology</title>
      <p>We first present the tracks and test cases proposed this year to the OAEI participants
(§2.1). Then, we discuss the resources used by participants to test their systems and the
execution environment used for running the tools (§2.2). Finally, we describe the steps
of the OAEI campaign (§2.3–2.5) and report on the general execution of the campaign
(§2.6).</p>
      <sec id="sec-3-1">
        <title>2.1 Tracks and test cases</title>
        <p>This year’s OAEI campaign consisted of 9 tracks gathering 23 test cases, and different
evaluation modalities:
Expressive Ontology tracks offer alignments between real world ontologies
expressed in OWL:
Anatomy (§3): The anatomy track comprises a single test case consisting of
matching the Adult Mouse Anatomy (2744 classes) and a small fragment of
the NCI Thesaurus (3304 classes) describing the human anatomy. Results are
evaluated automatically against a manually curated reference alignment.
Conference (§4): The conference track comprises a single test case that is a suite
of 21 matching tasks corresponding to the pairwise combination of 7
ontologies describing the domain of organizing conferences. Results are evaluated
automatically against reference alignments in several modalities, and by using
logical reasoning techniques.</p>
        <p>Large biomedical ontologies (§5): The largebio track comprises 6 test cases
involving 3 large and semantically rich biomedical ontologies: FMA,
SNOMEDCT, and NCI Thesaurus. These test cases correspond to the pairwise
combination of these ontologies in two variants: small overlapping fragments, in which
only overlapping sections of the ontologies are matched, and whole ontologies.
The evaluation is based on reference alignments automatically derived from the
UMLS Metathesaurus, with mappings causing logical incoherence flagged so
as not to be taken into account.</p>
        <p>Disease &amp; Phenotype (§6): The disease &amp; phenotype track comprises 4 test cases
that involve 6 biomedical ontologies covering the disease and phenotype
domains: HPO versus MP, DOID versus ORDO, HPO versus MeSH, and HPO
versus OMIM. The evaluation has been performed according to (1) a
consensus alignment generated from those produced by the participating systems, (2)
a set of manually generated mappings, and (3) a manual assessment of unique
mappings (i.e., mappings that are not suggested by other systems).</p>
        <p>Multilingual tracks offer alignments between ontologies in different languages:
Multifarm (§7): The multifarm track is based on a subset of the Conference data
set translated into ten different languages, in addition to their original English:
Arabic, Chinese, Czech, Dutch, French, German, Italian, Portuguese, Russian,
and Spanish. It consists of two test cases: same ontologies, where two versions
of the same ontology in different languages are matched, and different
ontologies, in which two different ontologies in different languages are matched. In
total, 45 language pairings are evaluated, meaning that the same ontologies
test case comprises 315 matching tasks, and the different ontologies test case
comprises 945 matching tasks. Results are evaluated automatically against
reference alignments.</p>
        <p>Interactive tracks provide simulated user interaction to enable the benchmarking of
algorithms designed to make use of it, with respect to both the improvement in the
results and the workload of the user:
Interactive Matching Evaluation (§8): The Interactive track is based on the test
cases from the anatomy and conference tracks. An Oracle, which matching
tools can access programmatically, simulates user feedback by querying the
reference alignment of the test case. The Oracle can generate erroneous
responses at a given rate, to simulate user errors. The evaluation is based on the
same reference alignments, and contemplates the number of user interactions
and the fraction of erroneous responses received by the tool, in addition to the
standard evaluation parameters.</p>
        <p>Instance Matching tracks focus on alignments between ontology instances expressed
in the form of OWL Aboxes:
Instance Matching (§9). The instance track comprises two independent
subtracks:
SYNTHETIC: This sub-track consists of matching instances that are found to
refer to the same real-world entity corresponding to a creative work (that
can be a news item, blog post or programme). It includes two evaluation
modalities, Sandbox and Mainbox, which differ on the number of instances
to match. The evaluation is automatic, based on a reference alignment, and
partially blind – matching tools have access only to the Sandbox reference
alignment.</p>
        <p>DOREMUS: This sub-track consists of matching real world datasets about
classical music artworks from two major French cultural institutions: the
French National Library (BnF) and the Philharmonie de Paris (PP). Both
datasets use the same vocabulary, the DOREMUS model, issued from the
DOREMUS project<sup>4</sup>. This sub-track comprises two different test cases
called heterogeneities (HT) and false-positives trap (FPT) characterized
by different degrees of heterogeneity in artwork descriptions. The
evaluation is automatic and based on reference alignments.</p>
        <p>HOBBIT Link Discovery (§10). The HOBBIT track aims to deal with link
discovery for spatial data represented as trajectories or traces, i.e., sequences of
longitude, latitude pairs. It comprises two test cases: Linking and Spatial. The
Linking test case consists in matching traces that have been modified using
string-based approaches, different date and coordinate formats, and by
addition and/or deletion of intermediate points. In the Spatial test case, the goal is
to identify DE-9IM (Dimensionally Extended nine-Intersection Model)
topological relations between traces: Equals, Disjoint, Touches, Contains/Within,
Covers/CoveredBy, Intersects, Crosses, Overlaps. For each relation, a
different pair of source and target datasets is given to the participants, so the test</p>
        <sec id="sec-3-1-1">
          <title>4 http://www.doremus.org</title>
          <p>case consists of 8 individual matching tasks. In both test cases, two evaluation
modalities, Sandbox and Mainbox, were considered, differing on the number of
instances to match. The evaluation is automatic and based on reference
alignments.</p>
          <p>Process Model Matching (§11): The process model track is concerned with the
application of ontology matching techniques to the problem of matching
process models. It comprises two test cases used in the Process Model Matching
Campaign 2015 [4] which have been converted to an ontological
representation, with process model entities being represented as ontology instances. The
first test case contains nine process models which represent the application
process for a master program of German universities as well as reference
alignments between all pairs of models. The second test case consists of process
models which describe the process of registering a newborn child in
different countries. The evaluation is automatic, based on reference alignments, and
uses standard precision and recall measures as well as a probabilistic variant
described in [29].
Since 2011, tool developers had to implement a simple interface and to wrap their tools
in a predefined way including all required libraries and resources. A tutorial for tool
wrapping was provided to the participants, describing how to wrap a tool and how to
use the SEALS client to run a full evaluation locally. This client is then executed by the
track organizers to run the evaluation. This approach ensures the reproducibility and
comparability of the results of all systems.
Ontologies to be matched and (where applicable) reference alignments have been
provided in advance during the period between June 1st and July 15th, 2017. This gave
potential participants the occasion to send observations, bug corrections, remarks and
other test cases to the organizers. The goal of this preparatory period is to ensure that
the delivered tests make sense to the participants. The final test base was released on
July 15th, 2017 and did not evolve after that.</p>
        </sec>
      </sec>
      <sec id="sec-3-2">
        <title>2.4 Execution phase</title>
        <p>During the execution phase, participants used their systems to automatically match the
test case ontologies. In most cases, ontologies are described in OWL-DL and serialized
in the RDF/XML format [11]. Participants can self-evaluate their results either by
comparing their output with reference alignments or by using the SEALS client to compute
precision and recall. They can tune their systems with respect to the non blind
evaluation as long as the rules published on the OAEI web site are satisfied. This phase
has been conducted between July 15th and August 31st, 2017, except for the HOBBIT
track which was extended until September 15th, 2017. Like last year, we requested a
mandatory registration of systems and a preliminary evaluation of wrapped systems by
July 31st, to alleviate the burden of debugging systems with respect to issues with the
SEALS client during the Evaluation phase.</p>
      </sec>
      <sec id="sec-3-3">
        <title>2.5 Evaluation phase</title>
        <p>Participants were required to submit their SEALS-wrapped tools by August 31st, 2017,
and their HOBBIT-wrapped tool by September 15th, 2017. Tools were then tested by
the organizers and minor problems were reported to some tool developers, who were
given the opportunity to fix their tools and resubmit them.</p>
        <p>Initial results were provided directly to the participants between September 1st and
October 15th, 2017. The final results for most tracks were published on the respective
pages of the OAEI website by October 15th, although some tracks were delayed.</p>
        <p>The standard evaluation measures are precision, recall and F-measure computed
against the reference alignments. More details on the evaluation are given in the sections
for the test cases.</p>
      </sec>
      <sec id="sec-3-4">
        <title>2.6 Comments on the execution</title>
        <p>Following an initial period of growth, the number of OAEI participants has remained
approximately constant since 2012, at slightly over 20 (see Figure 1). This year was
no exception, as we counted 21 participating systems. Table 2 lists the participants and
the tracks in which they competed. Some matching systems participated with different
variants (DiSMatch and LogMap) whereas others were evaluated with different
configurations, as requested by developers (see test case sections for details).</p>
        <p>Confidence pertains to the confidence scores returned by the system, with ✓ indicating that they
are non-boolean; ○ indicates that the system did not participate in the track; ● indicates that it
participated fully in the track; and ◐ indicates that it participated in or completed only part of the
tasks of the track.</p>
      </sec>
    </sec>
    <sec id="sec-4">
      <title>3 Anatomy</title>
      <p>The anatomy test case confronts matching systems with two fragments of biomedical
ontologies which describe the human anatomy<sup>5</sup> and the anatomy of the mouse<sup>6</sup>. This
data set has been used since 2007 with some improvements over the years [15].</p>
      <p><sup>5</sup> http://www.cancer.gov/cancertopics/cancerlibrary/terminologyresources/</p>
      <sec id="sec-4-1">
        <title>3.1 Experimental Setting</title>
        <p>We conducted experiments by executing each system in its standard setting and we
compare precision, recall, F-measure and recall+ against a manually curated reference
alignment. Recall+ indicates the amount of detected non-trivial correspondences, i.e.,
correspondences that do not have the same normalized label. The approach that generates
only trivial correspondences is depicted as baseline StringEquiv in the following section.</p>
        <p>We ran the systems on a server with 3.46 GHz (6 cores) and 8GB allocated RAM,
using the SEALS client. However, we changed the way precision and recall are
computed by removing trivial correspondences in the oboInOwl namespace like:
http://...oboInOwl#Synonym = http://...oboInOwl#Synonym
as well as correspondences expressing relations different from equivalence. Thus, the
results generated by the SEALS client vary in some cases by 0.5% compared to the
results presented below. Using the Pellet reasoner we also checked whether the generated
alignment is coherent, i.e., that there are no unsatisfiable classes when the ontologies
are merged with the alignment.</p>
      </sec>
      <sec id="sec-4-2">
        <title>3.2 Results</title>
        <p>In Table 3, we show the results of the 11 participating systems that generated an
alignment, including 3 versions of LogMap. A number of systems participated in the anatomy
track for the first time this year: KEPLER, POMap, SANOM, WikiV2, and YAM-BIO. For
more details, we refer the reader to the papers presenting the systems.</p>
        <p>This year 5 out of 11 systems were able to achieve the alignment task in less than
100 seconds: LogMapLite, LogMap, XMap, AML and YAM-BIO. In 2016 and 2015, there
were 4 out of 13 systems and 6 out of 15 systems respectively that generated an
alignment in this time frame. As in the last 5 years, LogMapLite has the shortest runtime. The
table shows that there is no correlation between the quality of the generated alignment
in terms of precision and recall and the runtime. This result had also been observed in
previous OAEI campaigns.</p>
        <p><sup>6</sup> http://www.informatics.jax.org/searches/AMA_form.shtml</p>
        <p>The table also shows the results for F-measure, recall+ and the size of alignments.
Regarding F-measure, the top 3 ranked systems (AML, YAM-BIO, POMap) achieve an
F-measure above 0.93. Among these, AML achieved the highest F-measure (0.943).
All of the long-term participants in the track showed comparable results in terms of
F-measure to their last year’s results and at least as good as the results of the best
systems in OAEI 2007-2010. Regarding recall+, AML, LogMap, LogMapLite showed
similar results to previous years. LogMapBio has a slight increase from 0.728 in 2016 to
0.733 in 2017. XMap decreases a bit from 0.647 to 0.639. Two new participants obtained
good results for recall+, POMap scored 0.824 (second place) followed by YAM-BIO with
0.794 (third place). In terms of the number of correspondences, long-term participants
computed similar numbers of correspondences as last year. AML and LogMap generated
the same number of correspondences, LogMapBio generated 3 more correspondences,
LogMapLite generated 1 more, ALIN generated 6 more and XMap generated 1 less.</p>
        <p>This year, 10 out of 11 systems achieved an F-measure higher than the baseline.
This is a slightly better result than last year when 9 out of 13 surpassed the baseline.
Five systems produced coherent alignments, which is comparable to the last two years
when 7 out of 13 and 5 out of 10 systems achieved this. Two of the three best systems
with respect to F-measure (YAM-BIO and POMap) produced incoherent alignments.</p>
      </sec>
      <sec id="sec-4-3">
        <title>3.3 Conclusions</title>
        <p>The number of systems participating in the anatomy track has varied throughout the
years. This year, it is lower than in the two previous editions, but higher than in 2014.
As noted previously there are newly-joined systems as well as long-term participants.</p>
        <p>The systems that participated in the previous edition in 2016 scored similarly to
their previous results. As last year, the AML system set the top result for anatomy track
with respect to F-measure. Two of the newly-joined systems (YAM-BIO and POMap)
achieved 2nd and 3rd best score in terms of F-measure.</p>
      </sec>
    </sec>
    <sec id="sec-5">
      <title>4 Conference</title>
      <sec id="sec-5-1">
        <title>4.1 Test data</title>
        <p>The conference test cases require matching several moderately expressive ontologies
from the conference organisation domain.</p>
        <p>The data set consists of 16 ontologies in the domain of organising conferences. These
ontologies were developed within the OntoFarm project7.</p>
        <p>The main features of this test case are:
7 http://owl.vse.cz:8080/ontofarm/
– Generally understandable domain. Most ontology engineers are familiar with
organising conferences. Therefore, they can create their own ontologies as well as
evaluate the alignments among their concepts with enough erudition.
– Independence of ontologies. Ontologies were developed independently and based
on different resources, they thus capture the issues in organising conferences from
different points of view and with different terminologies.
– Relative richness in axioms. Most ontologies were equipped with OWL DL axioms
of various kinds; this opens a way to use semantic matchers.</p>
        <p>Ontologies differ in their numbers of classes and properties, in expressivity, but also
in underlying resources.
4.2</p>
      </sec>
      <sec id="sec-5-2">
        <title>Results</title>
        <p>We performed three kinds of evaluations. First, we provide results in terms of
F-measure, comparison with baseline matchers and results of matchers from previous
OAEI editions and precision/recall triangular graph based on sharp reference
alignments. Second, we provide an evaluation based on the uncertain version of the reference
alignment, and finally we also provide an evaluation based on violations of consistency
and conservativity principles.</p>
      </sec>
      <sec id="sec-5-3">
        <title>Evaluation based on sharp reference alignments</title>
        <p>We evaluated the results of participants against blind reference alignments (labelled as rar2).8 This includes all pairwise
combinations between 7 different ontologies, i.e., 21 alignments.</p>
        <p>We have prepared the reference alignments in two steps. First, we have generated
them as a transitive closure computed on the original reference alignments. In order to
obtain a coherent result, conflicting correspondences, i.e., those causing unsatisfiability,
have been manually inspected and incoherency has been resolved by evaluators. The
resulting reference alignments are labelled as ra2. Second, we detected violations of
conservativity using the approach from [44] and resolved them by an evaluator. The
resulting reference alignments are labelled as rar2. As a result, the degree of correctness
and completeness of the new reference alignments is probably slightly better than for
the old one. However, the differences are relatively limited. Whereas the new reference
alignments are not open, the old reference alignments (labeled as ra1 on the conference
web page) are available. These represent close approximations of the new ones.</p>
        <p>Table 4 shows the results of all participants with regard to the reference alignment
rar2. F0.5-measure, F1-measure and F2-measure are computed for the threshold that
provides the optimal F1-measure. F1 is the harmonic mean of precision and recall where
both are equally weighted; F2 weights recall higher than precision and F0.5 weights
precision higher than recall. The matchers shown in the table are ordered according to their
highest average F1-measure. We employed two baseline matchers. edna (string edit
distance matcher) was used within the benchmark test cases in previous years and with regard to
performance it is very similar to the previously used baseline2 in the conference track;
StringEquiv is used within the anatomy test case. This year these baselines divide matchers into two
performance groups.
8 More details about evaluation applying other sharp reference alignments are available at the
conference web page.</p>
        <p>Matcher</p>
        <p>AML
LogMap</p>
        <p>XMap
LogMapLt</p>
        <p>edna
KEPLER</p>
        <p>WikiV3
StringEquiv</p>
        <p>POMap</p>
        <p>ALIN
SANOM
ONTMAT</p>
        <p>With regard to the two baselines, we can group tools according to each matcher’s
position. In all, four tools outperformed both baselines (AML, LogMap, XMap and LogMapLt),
and two newcomers (KEPLER and WikiV3) performed better than one baseline. Other
matchers (POMap, ALIN, SANOM and ONTMAT) performed worse than both baselines. Four tools
(ALIN, POMap, ONTMAT and SANOM) did not match properties at all. Of course, this had
a negative effect on those tools’ overall performance. More details about evaluation
considering only classes or properties are on the conference web page. The performance of all matchers
(except ONTMAT) regarding their precision, recall and F1-measure is visualised in Figure 2.
Matchers are represented as squares or triangles. Baselines are represented as circles.
Comparison with previous years with regard to rar2 Four matchers, top-performers, also
participated in the Conference test cases in OAEI 2016. None of them improved with regard to
F1-measure evaluation.</p>
        <p>Evaluation based on uncertain version of reference alignments The confidence values
of all matches in the sharp reference alignments for the conference track are all 1.0. For the
uncertain version of this track, the confidence value of a match has been set equal to the percentage
of a group of people who agreed with the match in question (this uncertain version is based on the
reference alignment labeled ra1). One key thing to note is that the group was only asked to
validate matches that were already present in the existing reference alignments – so some matches
had their confidence value reduced from 1.0 to a number near 0, but no new match was added.</p>
        <p>There are two ways that we can evaluate matchers according to these “uncertain” reference
alignments, which we refer to as discrete and continuous. The discrete evaluation considers any
match in the reference alignment with a confidence value of 0.5 or greater to be fully correct
and those with a confidence less than 0.5 to be fully incorrect. Similarly, a matcher’s match is
considered a “yes” if the confidence value is greater than or equal to the matcher’s threshold
and a “no” otherwise. In essence, this is the same as the “sharp” evaluation approach, except
that some matches have been removed because less than half of the crowdsourcing group agreed
with them. The continuous evaluation strategy penalises a matcher more if it misses a match on</p>
        <p>F1-measure=0.7</p>
        <p>F1-measure=0.6
F1-measure=0.5
AML
KEPLER
LogMap
LogMapLt
POMap
SANOM
WikiV3
XMap
edna
StringEquiv
rec=1.0
rec=.8
rec=.6
pre=.6
pre=.8
pre=1.0
which most people agree than if it misses a more controversial match. For instance, if A ≡ B
with a confidence of 0.85 in the reference alignment and a matcher gives that correspondence a
confidence of 0.40, then that is counted as 0.85 × 0.40 = 0.34 true positive and 0.85 − 0.40 =
0.45 false negative.</p>
        <p>Out of the ten alignment matchers, three (ALIN, LogMapLt and ONTMAT) use 1.0 as the
confidence value for all matches they identify. Two more have a narrow range of confidence
values (POMap’s values vary between 0.8 and 1.0, with the majority falling between 0.93 and
1.0 while SANOM’s values are relatively tightly clustered between 0.73 and 0.9). The remaining
five systems (AML, KEPLER, LogMap, WikiV3 and XMap) have a wide variation of confidence
values.</p>
        <p>When comparing the performance of the matchers on the uncertain reference alignments
versus that on the sharp version (see Table 5), we see that in the discrete case all matchers performed
the same or slightly better. Improvement in F-measure ranged from 0 to 8 percentage points
over the sharp reference alignment. This was driven by increased recall, which is a result of the
presence of fewer “controversial” matches in the uncertain version of the reference alignment.</p>
        <p>The performance of most matchers is very similar regardless of whether a discrete or
continuous evaluation methodology is used (provided that the threshold is optimized to achieve the
highest possible F-measure in the discrete case). The primary exceptions to this are KEPLER,
LogMap and SANOM. These systems perform significantly worse when evaluated using the
continuous version of the metrics. In the LogMap and SANOM cases, this is because the matcher
assigns low confidence values to some matches in which the labels are equivalent strings, which
many crowdsourcers agreed with unless there was a compelling technical reason not to. This hurts
recall, but using a low threshold value in the discrete version of the evaluation metrics ’hides’ this
problem. In the case of KEPLER, the issue is that entities whose labels share a word in common
have fairly high confidence values, even though they are often not equivalent. For example,
“Review” and “Reviewing Event”. This hurts precision in the continuous case, but is taken care of
by using a high threshold value in the discrete case.</p>
        <p>Five matchers from this year also participated last year, and thus we are able to make some
comparisons over time. The F-measures of all systems essentially held constant (within one
percent) when evaluated against the uncertain reference alignments. This is in contrast to last year,
in which most matchers made modest gains (in the neighborhood of 1 to 6 percent) over 2015. It
seems that, barring any new advances, participating matchers have reached something of a steady
state on this performance metric.</p>
      </sec>
      <sec id="sec-5-4">
        <title>Evaluation based on violations of consistency and conservativity principles</title>
        <p>We performed evaluation based on detection of conservativity and consistency violations [44, 45]. The
consistency principle states that correspondences should not lead to unsatisfiable classes in the
merged ontology; the conservativity principle states that correspondences should not introduce
new semantic relationships between concepts from one of the input ontologies.</p>
        <p>Table 4 shows the number of unsatisfiable TBoxes after the ontologies are merged
(Inc. Align.), the total number of all conservativity principle violations within all alignments
(Conser.V.) and the total number of all consistency principle violations (Consist.V.).</p>
        <p>Five tools (ALIN, AML, LogMap, ONTMAT and POMap) have no consistency principle
violation (in comparison to seven last year) and two tools (SANOM and XMap) generated only
one incoherent alignment. There is one tool (ALIN) having no conservativity principle violations.
Further two tools (ONTMAT and POMap) have an average of conservativity principle violations
around 1. We should note that these conservativity principle violations can be “false positives”
since the entailment in the aligned ontology can be correct although it was not derivable in the
single input ontologies.
4.3</p>
      </sec>
      <sec id="sec-5-5">
        <title>Conclusions</title>
        <p>In conclusion, this year four of ten matchers performed better than both baselines on sharp
reference alignments. Further, this year five matchers generated coherent alignments (against seven
matchers last year and five matchers the year before). Based on the uncertain reference
alignments we can conclude that all matchers perform better on the fuzzy versus sharp version of the
benchmark and eight matchers have close correspondence on the continuous and discrete version,
indicating good agreement with the human matchers. Finally, none of the five matchers that also
participated last year improved their performance with regard to the evaluation based on the sharp
or the uncertain reference alignments.
5</p>
      </sec>
    </sec>
    <sec id="sec-6">
      <title>Large biomedical ontologies (largebio)</title>
      <p>The largebio test case aims at finding alignments between the large and semantically rich
biomedical ontologies FMA, SNOMED-CT, and NCI, which contain 78,989, 306,591 and 66,724
classes, respectively.
The test case has been split into three matching problems: FMA-NCI, FMA-SNOMED and
SNOMED-NCI. Each matching problem has been further divided in 2 tasks involving differently
sized fragments of the input ontologies: small overlapping fragments versus whole ontologies
(FMA and NCI) or large fragments (SNOMED-CT).</p>
      <p>The UMLS Metathesaurus [6] has been selected as the basis for reference alignments. UMLS
is currently the most comprehensive effort for integrating independently-developed medical
thesauri and ontologies, including FMA, SNOMED-CT, and NCI. The extraction of mapping from
UMLS is detailed in [26].</p>
      <p>Since alignment coherence is an aspect of ontology matching that we aim to promote, in
previous editions we provided coherent reference alignments by refining the UMLS mappings
using the Alcomo (alignment) debugging system [32], LogMap’s (alignment) repair facility [25],
or both [27].</p>
      <p>However, concerns were raised about the validity and fairness of applying automated
alignment repair techniques to make reference alignments coherent [37]. It is clear that using the
original (incoherent) UMLS alignments would be penalizing to ontology matching systems that
perform alignment repair. However, using automatically repaired alignments would penalize
systems that do not perform alignment repair and also systems that employ a repair strategy that
differs from that used on the reference alignments [37].</p>
      <p>Thus, as of the 2014 edition, we arrived at a compromise solution that should be fair to all
ontology matching systems. Instead of repairing the reference alignments as normal, by
removing correspondences, we flagged the incoherence-causing correspondences in the alignments by
setting the relation to “?” (unknown). These “?” correspondences will neither be considered as
positive nor as negative when evaluating the participating ontology matching systems, but will
simply be ignored. This way, systems that do not perform alignment repair are not penalized
for finding correspondences that (despite causing incoherences) may or may not be correct, and
systems that do perform alignment repair are not penalized for removing such correspondences.</p>
      <p>To ensure that this solution was as fair as possible to all alignment repair strategies, we
flagged as unknown all correspondences suppressed by any of Alcomo, LogMap or AML [39], as
well as all correspondences suppressed from the reference alignments of last year’s edition (using
Alcomo and LogMap combined). Note that, we have used the (incomplete) repair modules of
the above mentioned systems.</p>
      <p>The flagged UMLS-based reference alignment for the OAEI 2017 campaign is summarized
in Table 6.</p>
      <p>“=” corresp. “?” corresp.</p>
      <p>FMA-NCI
FMA-SNOMED
SNOMED-NCI
2,686
6,026
17,210
338
2,982
1,634
5.2</p>
      <sec id="sec-6-1">
        <title>Evaluation setting, participation and success</title>
        <p>We have run the evaluation on an Ubuntu laptop with an Intel Core i7-4600U CPU @ 2.10GHz x 4
and allocating 15Gb of RAM. Precision, Recall and F-measure have been computed with respect
to the UMLS-based reference alignment. Systems have been ordered in terms of F-measure.</p>
        <p>In the OAEI 2017 largebio track 10 out of 21 participating systems have been able to cope
with at least one of the tasks of the largebio track with a 4 hours timeout. Note that we also
include the results of Tool1 (the developers withdrew the system from the campaign) as reference.
9 systems were able to complete more than one task, while 6 systems were able to complete all
tasks. This is an improvement with respect to last year results where only 4 systems were able to
complete all tasks.
5.3</p>
      </sec>
      <sec id="sec-6-2">
        <title>Background knowledge</title>
        <p>Regarding the use of background knowledge, LogMap-Bio uses BioPortal as mediating ontology
provider, that is, it (automatically) retrieves from BioPortal the most suitable top-10 ontologies
for the matching task.</p>
        <p>LogMap uses normalisations and spelling variants from the general (biomedical) purpose
UMLS Lexicon (a different resource with respect to the UMLS Metathesaurus).</p>
        <p>AML has three sources of background knowledge which can be used as mediators
between the input ontologies: the Uber Anatomy Ontology (Uberon), the Human Disease Ontology
(DOID) and the Medical Subject Headings (MeSH).</p>
        <p>YAM-BIO uses as background knowledge a file containing mappings from the DOID and
UBERON ontologies to other ontologies like FMA, NCI or SNOMED CT.</p>
        <p>XMAP uses synonyms provided by the UMLS Metathesaurus. Note that matching systems
using UMLS Metathesaurus as background knowledge will have a notable advantage since the
largebio reference alignment is also based on the UMLS Metathesaurus.
5.4</p>
      </sec>
      <sec id="sec-6-3">
        <title>Alignment coherence</title>
        <p>
          Together with Precision, Recall, F-measure and run times we have also evaluated the coherence
of alignments. We report (1) the number of unsatisfiabilities when reasoning with the input
ontologies together with the computed alignments, and (2) the ratio of unsatisfiable classes with
respect to the size of the union of the input ontologies.
        </p>
        <p>We have used the OWL 2 reasoner HermiT [35] to compute the number of unsatisfiable
classes. For the cases in which HermiT could not cope with the input ontologies and the
alignments (in less than 2 hours) we have provided a lower bound on the number of unsatisfiable
classes (indicated by ≥) using the OWL 2 EL reasoner ELK [28].</p>
        <p>In this OAEI edition, only three distinct systems have shown alignment repair facilities: AML,
LogMap and its LogMap-Bio variant, and XMap (which reuses the repair techniques from
Alcomo [32]). Note that only LogMap and LogMap-Bio are able to reduce to a minimum the
number of unsatisfiable classes across all tasks, missing 9 unsatisfiable classes in the worst case
(whole FMA-NCI task).</p>
        <p>Tables 8-9 (see last two columns) show that even the most precise alignment sets may lead to
a huge number of unsatisfiable classes. This proves the importance of using techniques to assess
the coherence of the generated alignments if they are to be used in tasks involving reasoning. We
encourage ontology matching system developers to develop their own repair techniques or to use
state-of-the-art techniques such as Alcomo [32], the repair module of LogMap (LogMap-Repair)
[25] or the repair module of AML [39], which have worked well in practice [27, 23].</p>
      </sec>
      <sec id="sec-6-4">
        <title>Runtimes and task completion</title>
      </sec>
      <sec id="sec-6-5">
        <title>Results for the FMA-NCI matching problem</title>
        <p>Table 8 summarizes the results for the tasks in the FMA-NCI matching problem.</p>
        <p>XMap and YAM-BIO achieved the highest F-measure in Task 1, while XMap and AML in
Task 2. Note however that the use of background knowledge based on the UMLS Metathesaurus
has an important impact in the performance of XMap. The use of background knowledge led to
*Uses background knowledge based on the UMLS Metathesaurus which is the basis of the
largebio reference alignments.
an improvement in recall from LogMap-Bio over LogMap in both tasks, but this came at the cost
of precision, resulting in the two variants of the system having identical F-measures.</p>
        <p>Note that the effectiveness of the systems decreased from Task 1 to Task 2. One reason for
this is that with larger ontologies there are more plausible mapping candidates, and thus it is
harder to attain both a high precision and a high recall. Another reason is that the very scale
of the problem constrains the matching strategies that systems can employ: AML for example,
foregoes its matching algorithms that are computationally more complex when handling very
large ontologies, due to efficiency concerns.</p>
        <p>The size of Task 2 proved a problem for a number of systems, which were unable to complete
it within the allotted time: POMAP, SANOM, KEPLER and Wiki2.
5.7</p>
      </sec>
      <sec id="sec-6-6">
        <title>Results for the FMA-SNOMED matching problem</title>
        <p>Table 9 summarizes the results for the tasks in the FMA-SNOMED matching problem.</p>
        <p>XMap produced the best results in terms of both Recall and F-measure in Task 3 and Task
4, but again, we must highlight that it uses background knowledge based on the UMLS
Metathesaurus. Among the other systems, AML and YAM-BIO achieved the highest F-measure in Tasks
3 and 4, respectively.
*Uses background knowledge based on the UMLS Metathesaurus which is the basis of the
largebio reference alignments.</p>
        <p>Overall, the quality of the results was lower than that observed in the FMA-NCI matching
problem, as the matching problem is considerably larger. Like in the FMA-NCI matching
problem, the effectiveness of all systems decreases as the ontology size increases from Task 3 to Task
4; and of the systems that completed the former, for example, POMAP was unable to complete
the latter.
5.8</p>
      </sec>
      <sec id="sec-6-7">
        <title>Results for the SNOMED-NCI matching problem</title>
        <p>Table 10 summarizes the results for the tasks in the SNOMED-NCI matching problem.</p>
        <p>AML achieved the best results in terms of both Recall and F-measure in Tasks 5 and 6, while
LogMap and AML achieved the best results in terms of precision in Tasks 5 and 6, respectively.</p>
        <p>The overall performance of the systems was lower than in the FMA-SNOMED case, as this
test case is even larger. Indeed, several systems were unable to complete even the smaller Task 5
within the allotted time: POMAP, SANOM and KEPLER.</p>
        <p>As in the previous matching problems, effectiveness decreased as the ontology size increases.
Unlike in the FMA-NCI and FMA-SNOMED matching problems, the use of the UMLS
Metathesaurus did not positively impact the performance of XMap, which obtained lower results than
expected.
*Uses background knowledge based on the UMLS Metathesaurus which is the basis of the
largebio reference alignments.
6</p>
      </sec>
    </sec>
    <sec id="sec-7">
      <title>Disease and Phenotype Track (phenotype)</title>
      <p>The Pistoia Alliance Ontologies Mapping project team9 organises this track based on a real use
case where it is required to find alignments between disease and phenotype ontologies.
Specifically, in the OAEI 2017 edition of this track the selected ontologies are the Human Phenotype
Ontology (HPO), the Mammalian Phenotype Ontology (MP), the Human Disease Ontology (DOID),
the Orphanet and Rare Diseases Ontology (ORDO), the Medical Subject Headings (MESH)
ontology, and the Online Mendelian Inheritance in Man (OMIM) ontology. The extended results for
the OAEI 2016 Disease and Phenotype track (previous campaign) are available in [24].
The 2017 edition comprises four tasks requiring the pairwise alignment of:
– Human Phenotype Ontology (HP) to Mammalian Phenotype Ontology (MP);
– Human Disease Ontology (DOID) to the Orphanet Rare Disease Ontology (ORDO);
– Human Phenotype Ontology (HP) to Medical Subject Headings (MESH); and
– Human Phenotype Ontology (HP) to Online Mendelian Inheritance in Man (OMIM).</p>
      <p>Currently, mappings between these ontologies are mostly curated by bioinformatics and
disease experts who would benefit from automation of their workflows supported by implementation
of ontology matching algorithms.
9 http://www.pistoiaalliance.org/projects/ontologies-mapping/
We have run the evaluation on an Ubuntu laptop with an Intel Core i7-4600U CPU @ 2.10GHz x
4 and allocating 15Gb of RAM.</p>
      <p>In the OAEI 2017 phenotype track 10 out of 21 participating OAEI 2017 systems have been
able to cope with at least one of the tasks with a 4 hours timeout.
Systems have been evaluated according to the following criteria:
– Precision and recall with respect to a consensus alignment automatically generated by voting
based on the outputs of all participating systems (we have used vote=2, vote=3 and vote=4).
– Semantic recall with respect to manually generated mappings for several areas of interest
(e.g., carbohydrate, obesity and breast cancer).
– Manual assessment of a subset of unique mappings (i.e., mappings that are not suggested by
other systems).</p>
      <p>We have used the OWL 2 reasoner HermiT to calculate the semantic recall. For example,
a positive hit will mean that a mapping in the reference has been (explicitly) included in the
output mappings or it can be inferred using reasoning from the input ontologies and the output
mappings.11.
6.4</p>
      <sec id="sec-7-1">
        <title>Use of background knowledge</title>
        <p>LogMapBio uses BioPortal as mediating ontology provider, that is, it retrieves from BioPortal
the most suitable top-10 ontologies for the matching task.
10 https://www.bioontology.org/wiki/index.php/BioPortal_Mappings
11 Details about the used notion of semantic precision and recall can be found in [24]</p>
        <p>FMA-NCI Small Dataset
N/A 0.995 0.455 0.624 – – –
2 0.996 0.63 0.772 0.996 0.63 0.772
85 0.971 0.614 0.752 0.996 0.63 0.772
152 0.958 0.593 0.733 0.996 0.624 0.767
91 0.937 0.58 0.716 0.996 0.623 0.767
NI stands for non-interactive, and refers to the results obtained by the matching system in the
original track. ALIN was unable to complete the SNOMED-NCI task.
produced by the systems are typically larger than without interaction, which makes the repair
process harder. The introduction of oracle errors complicates the process further, and may make
an alignment irreparable if the system follows the oracle’s feedback blindly.
9</p>
      </sec>
    </sec>
    <sec id="sec-8">
      <title>Instance matching</title>
      <p>The instance matching track aims at evaluating the performance of matching tools when the goal
is to detect the degree of similarity between pairs of items/instances expressed in the form of
OWL Aboxes. The track is organized in two independent tasks called SYNTHETIC and
DOREMUS. Each test is based on two datasets called source and target and the goal is to discover the
matching pairs (i.e., mappings) among the instances in the source dataset and the instances in the
target dataset.</p>
      <p>For the sake of clarity, we split the presentation of the task results in two different subsections.
Task data The SYNTHETIC datasets are produced using SPIMBENCH [40] with the aim to
generate descriptions of the same entity where value-based, structure-based and semantics-aware
transformations are employed on source data in order to create the target data.</p>
      <p>The value-based transformations consider mainly typographical errors and different data
formats, the structure-based transformations implement transformations applied on the structure of
object and datatype properties and the semantics-aware transformations concern the instance level
and take into account schema information. The latter are used to examine if the matching
systems take into account RDFS and OWL constructs in order to discover correspondences between
instances that can be found only by considering schema information.</p>
      <p>We stress that an instance in the source dataset can have none or one matching counterpart in
the target dataset. A dataset is composed of a Tbox and a corresponding Abox. Source and target
datasets share almost the same Tbox (differences are found in the properties due to the employed
structure-based transformations). The Sandbox scale is 10K triples ≈ 380 instances while the
Mainbox scale is 50K triples ≈ 1800 instances. We asked the participants to match the creative
works (news items, blogposts and programmes) in the source dataset against the instances of the
corresponding class in the target dataset.</p>
      <p>Results The participants of the SYNTHETIC task are the AgreementMakerLight (AML),
I-Match, Legato and LogMap systems. In order to evaluate those systems we built a ground
truth containing the set of expected links where an instance i1 in the source dataset is associated
with an instance j1 in the target dataset that has been generated as a modified description of i1.
The value-based, structure-based and semantics-aware transformations were applied on different
triples of the source dataset pertaining to one class instance.</p>
      <p>The systems were judged on the basis of the precision, recall and F-measure results shown
in Table 19. LogMap and Legato produce links that are very often correct (resulting in a good
precision) but fail to capture a large number of the expected links (resulting in a lower recall).
In the case of AML and I-Match systems, the probability of capturing a correct link is high, but
the probability of a retrieved link to be correct is lower, resulting in a high (almost perfect) recall
but a low precision. Regarding the size of the dataset, LogMap and Legato systems have better
results for the Sandbox dataset. On the other hand, AML and I-Match systems exhibit the same
performance for both the Sandbox and Mainbox datasets.
Task data The DOREMUS task, having its second appearance at the OAEI, contains real world
datasets coming from two major French cultural institutions – The BnF (French National
Library) and the PP (Philharmonie de Paris). The data are about classical music works and follow
the DOREMUS model (one single vocabulary for both datasets) issued from the DOREMUS
project.14 Each data entry, or instance, is a bibliographical record about a musical piece,
containing properties such as the composer, the title(s) of the work, the year of creation, the key, the
genre, the instruments, to name a few. These data have been converted to RDF from their original
UNI- and INTER-MARC formats and anchored to the DOREMUS ontology and a set of domain
controlled vocabularies with the help of the marc2rdf converter,15 developed for this purpose within
the DOREMUS Project (for more details on the conversion method and on the ontology we refer
to [1] and [31]). Note that these data are highly heterogeneous. We have selected works described
both at the BnF and at the PP with different degrees of heterogeneity in their descriptions. The
datasets have been selected for the purposes of two sub-tasks.</p>
      <p>Heterogeneities (HT): This sub-task consists in aligning two datasets, BnF-1 and PP-1,
containing about 238 instances each, by discovering 1:1 equivalence relations between them. There
are different types of heterogeneities that these data manifest, identified by music library experts,
such as multilingualism, differences in catalogs, differences in spelling, different degrees of
description, etc. The goal is to test the ability of linking tools to cope with these heterogeneities.
The participants are asked to map only instances of the F 22 Self Contained Expression
class.</p>
      <p>False Positives Trap (FPT): This sub-task consists in correctly disambiguating the instances
contained in two datasets of small sizes (75 instances each), BnF-2 and PP-2, by discovering 1:1
equivalence relations between the instances that they contain. Librarian experts have selected
several groups of music works with highly similar descriptions across the two datasets, where
there exists only one correct match in each group. The goal is to challenge the linking tools’
capacity to avoid the generation of false positives and match correctly instances in the presence
of highly similar but yet distinct candidates. The participants are asked to map only instances of
the F 22 Self Contained Expression class.</p>
      <p>Results Five systems participated and returned results on the DOREMUS track: AML, I-Match,
Legato, LogMap and NjuLink. Two systems stand out, outperforming significantly the other
participants on both sub-tasks – Legato and NjuLink, both achieving F-measures of over 0.9
14 http://www.doremus.org
15 https://github.com/DOREMUS-ANR/marc2rdf
(NjuLink leading on HT and Legato - on FP-trap). Both tasks appear to be fairly challenging for
the majority of the systems, with average F-measures of 0.636 for HT task and 0.565 for the
FP-trap task.
In this track, two benchmark generators are proposed to deal with link discovery for spatial data
represented as trajectories i.e., sequences of longitude, latitude pairs. This new track is using the
HOBBIT platform16 and follows different instructions than the SEALS-based tracks.</p>
      <p>We use TomTom17 datasets in order to create the benchmark. TomTom datasets contain
representations of traces (GPS fixes). Each trace consists of a number of points. Each point has a
timestamp, longitude, latitude and speed. The points are sorted in ascending order by the
timestamp of the corresponding GPS fix. Each task of the HOBBIT Link Discovery Track is composed
of two datasets with different number of instances to match, namely the Sandbox and the
Mainbox.</p>
      <p>The HOBBIT Link Discovery track comprises two tasks:
– Task 1 (Linking) measures how well the systems can match traces that have been modified
using string-based approaches along with addition and deletion of intermediate points. Since
TomTom datasets only contain coordinates, in order to apply string-based modifications
implemented in LANCE [41] we have replaced a number of those points with labels retrieved
from Linked Data spatial datasets using the Google Maps18, Foursquare19 and Nominatim
Openstreetmap20 APIs. This task also contains modifications on date and coordinate formats.
An instance in the source dataset has one matching counterpart in the target dataset. For the
Linking Task, the Sandbox scale is 100 instances while the Mainbox scale is 5K instances.
We asked the participants to match traces in the source and the target datasets.
The participants of the Linking task are AgreementMakerLight (AML) and OntoIdea
systems. For evaluation, we built a ground truth containing the set of expected links where an
instance i1 in the source dataset is associated with an instance j1 in the target dataset that
has been generated as an altered description of i1.</p>
      <p>The way that the transformations were done, was to apply value-based, and structure-based
transformations on different triples pertaining to instances of class Trace.
16 https://project-hobbit.eu/outcomes/hobbit-platform/
17 https://www.tomtom.com/
18 https://developers.google.com/maps/
19 https://developer.foursquare.com/
20 http://nominatim.openstreetmap.org/
The systems were judged on the basis of precision, recall, F-measure and runtime results that
are shown in Table 21. Both AML and OntoIdea systems return high precision and recall
capturing all the correct links. Regarding runtime, for the Sandbox dataset, AML needs less
time than OntoIdea and for the Mainbox dataset, AML completes the task with perfect results
in contrast to OntoIdea that was not able to complete it and stopped when it hit the platform
time limit (75 mins). Datasets, reference alignments, and task results are available on the
HOBBIT website: https://project-hobbit.eu/challenges/om2017/.
– Task 2 (Spatial) measures how well the systems can identify the DE-9IM (Dimensionally
Extended nine-Intersection Model) topological relations. The supported spatial relations are
the following: Equals, Disjoint, Touches, Contains/Within, Covers/CoveredBy, Intersects,
Crosses, Overlaps. The traces are represented in the Well-known text (WKT) format. For
each relation, a different pair of source and target datasets is given to the participants.
Given a LineString source geometry s, a LineString target geometry t and a DE-9IM
topological relation r, we ask the participants to match an instance from s with one or more
instances in t such that their Intersection Matrix follows the definition of r. For evaluation,
we built a ground truth using RADON [42] containing the set of expected links where an
instance i1 in the source dataset is associated with one or more instances in the target dataset
that has been generated as an altered description of i1. For the Spatial Task, the Sandbox
scale is 10 instances and the Mainbox scale is 2K instances.</p>
      <p>The participants to the Spatial task are AgreementMakerLight (AML), OntoIdea, Rapid
Discovery of Topological Relations (RADON) and Silk systems.</p>
      <p>The systems were judged on the basis of precision, recall, F-measure and runtime results
shown in Table 22 and Figures 8 and 9. We should mention that we are only presenting
the time performance and not precision, recall and f-measure as all were equal to 1.0 except
OntoIdea that reports for the Touches and Overlaps relations value 0.99. Moreover, Silk is
not participating in relations Covers and Covered By and OntoIdea is not participating in
relation Disjoint.</p>
      <p>From the results we can observe that:</p>
      <p>OntoIdea has the best performance in the Sandbox dataset but in the Mainbox dataset
the runtime increases and the system seems to not be able to handle large datasets easily.
Silk also seems to have a similar behaviour as OntoIdea.</p>
      <p>RADON and AML systems seem to handle the growth of the dataset size smoother.
AML does not provide any results for the Disjoint relation since it reaches the platform
time limit.
Datasets, reference alignments, and task results are available on the HOBBIT website:
https://project-hobbit.eu/challenges/om2017/.</p>
    </sec>
    <sec id="sec-9">
      <title>Process Model Matching</title>
      <p>In 2013 and in 2015 the community interested in business process modeling conducted an
evaluation campaign similar to the OAEI [4]. Instead of matching ontologies, the task was to match
process models described in different formalisms like BPMN and Petri Nets. Within this track we
offer a subset of the tasks from the Process Model Matching Contest as OAEI track by converting
the process models to an ontological representation. By offering this track, we hope to gain
insights in how far ontology matching systems are capable of solving the more specific problem of
matching process models. This track is also motivated by the discussions at the end of the 2015
Ontology Matching workshop, where many participants showed their interest in such a track.
We used two datasets from the 2015 Process Matching Contest. The first dataset (University
Admission dataset) deals with processing applications of Master students to a university. It consists
of nine different process models where each describes the concrete process of a specific German
university. We already used that dataset in the 2016 edition of the OAEI. The models are encoded
as BPMN process models. We converted the BPMN representation of the process models to a
set of assertions (ABox) using the vocabulary defined in the BPMN 2.0 ontology (TBox). The
second dataset, known as the Birth Registration dataset, describes the process of registering a
new born child in different countries. The process models were originally available as Petri Nets.
We converted them also to an ABox in an ontological representation. For that reason the resulting
matching tasks are instance matching tasks where each ABox is described by the same TBox.</p>
      <p>For each pair of processes manually generated reference alignments are available. Typical
activities within that domain are Sending acceptance, Invite student for interview, or Wait for
response. These examples illustrate one of the main differences to the ontology matching task.
The labels are usually verb-object phrases that are sometimes extended with more words. Another
important difference is related to the existence of an execution order (i.e., the model is a complex
sequence of activities) which can be understood as the counterpart to a type hierarchy.</p>
      <p>Only three systems generated non-empty results when running them against our datasets.
These systems are AML, LogMap, and I-Match. Note that we tried to execute all systems marked
as instance matching systems. However, the other systems threw exceptions or produced empty
alignments. We have collected all generated non-empty alignments. These alignments are the raw
results that the following report is based on.</p>
      <p>In our evaluation, we computed standard precision and recall, as well as the harmonic mean
known as f-measure. The dataset we used consists of several test cases. We aggregated the results
and present the micro average results. The gold standard we used for our first set of evaluation
experiments is based on the gold standard that has also been used at the Process Model Matching
Contest in 2015 [4]. We modified only some minor mistakes (resulting in changes less than 0.5
percentage points). In order to compare the results to the results obtained by the process model
matching community, we present also the recomputed values of the submissions to the 2015
contest.</p>
      <p>We extend our evaluation (“Standard” in Tables 23 and 24) by an evaluation measure that
makes use of a non-binary reference alignment (“Probabilistic” in Tables 23 and 24). This
probabilistic measure is based on a gold standard which is manually and independently generated by
several domain experts. The number of votes of these annotators are applied as support values in
the probabilistic evaluation. For a detailed discussion, please refer to [29].</p>
      <p>Furthermore, we evaluate the matching systems via matching patterns. Therefore the
matching task as well as the matcher output is automatically categorized into categories with different
complexity level. We classified each alignment in one out of five categories exclusively. In this
way, strengths and weaknesses of the matching systems can be analysed. For more details we refer
to [30].
The following tables show the results of our evaluation. Participants of the Process Model
Matching Contest and the OAEI 2016 edition are depicted in gray font, while this year’s OAEI
participants are shown in black font. Note that some systems participated with a version that has not
been modified with respect to its results comparing the OAEI 2016 and 2017 submission. We
added only one entry for them with the label OAEI-16/17. This is only the case for the first
dataset, which we have used already in 2016.</p>
      <p>Tables 23 and 24 summarize the results of our evaluation. “P” abbreviates precision, “R” is
recall, “FM” stands for f-measure and “Rk” means rank. The prefix “Pro” indicates the
probabilistic versions of the precision, recall, f-measure and the associated rank. The OAEI participants
are ranked on position 1, 11, 12 with an overall number of 17 systems listed in the table (when
using the standard metrics). Note that AML-PM at the PMMC 2015 was a matching system that
was based on a predecessor of AML participating at the OAEI 2016. The good results of AML
are surprising, since we expected that matching systems specifically developed for the purpose of
process model matching would outperform ontology matching systems applied to the special case
of process model matching. While AML contains also components that are specifically designed
for the process matching task (a flooding-like structural matching algorithm), its relevant main
components are developed for ontology matching and the sub-problem of instance matching.
AML and LogMap achieve the same results as in 2016. I-Match participates in 2017 for the first
time. Compared to the results of the tools specialized for the problem of process model matching,
the results of I-Match are still very good. There are still five systems that have in particular been
designed for matching process models, which achieve worse results.</p>
      <p>OAEI-16/17 221
PMMC-15 579
PMMC-15 277
OAEI-16 177
OAEI-16 150
PMMC-15 326
PMMC-15 261
OAEI-16/17 267</p>
      <p>OAEI-17 192
PMMC-15 140
PMMC-15 234
PMMC-15 828
PMMC-15 220
PMMC-15 164
PMMC-15 262
PMMC-15 505
PMMC-15 230
Rk</p>
      <p>The results for the Birth Registration dataset are more interesting, because we are using this
dataset in 2017 for the first time. Moreover, the dataset contains a higher amount of
correspondences that are hard to find by comparing the labels on a lexical level. This results usually in a
significantly lower F-measure compared to the University Admission dataset.</p>
      <p>The results show that AML is no longer the best of all matching systems. Four systems
from the process matching community achieve better results in terms of f-measure. This dataset
is dominated by the OPBOT system, while AML is among a group of follow-up systems that
perform still significantly better than the rest of the field. The other two systems, LogMap and
I-Match, achieve close results which are slightly worse than the average results. It is interesting
to see that the ranking among the three systems is the same across the two datasets.</p>
      <p>In the probabilistic evaluation, in the University Admission dataset however, the OAEI
participants gain position 2, 3, 16 respectively. LogMap rises from position 11 to 3. The (probabilistic)
precision improves over-proportionally for this matcher, because LogMap generates many
correspondences
Rk
1
15
13
8
9
14
6
11
12
4
5
17
2
3
7
16
10
which are not included in the binary gold standard but are included in the probabilistic
one. The ranking of LogMap demonstrates that a strength of the probabilistic metric lies in the
broadened definition of the gold standard where weak mappings are included but softened (via
the support values). In the probabilistic evaluation for the Birth Registration dataset, the three
participating matchers gain ranking 3, 8 and 10. LogMap rises from rank 11 to 3 in the probabilistic
evaluation. The matcher LogMap mainly identifies correspondences with high support (of which
many are not included in the binary gold standard). For the matcher AML, the opposite effect can
be observed. The matcher AML does not profit as much from the broadened gold standard in the
probabilistic evaluation in the Birth Registration dataset compared to the other matching systems.
The matchers improve their performance compared to the binary evaluation. This indicates that
in the binary gold standard many reasonable alignments are missing. Thus the matchers improve
their performance with the probabilistic evaluation. For details about the probabilistic metric,
please refer to [29].</p>
      <p>The results indicate that the progress made in ontology matching has also a positive impact
on other related matching problems, as is the case for process model matching. While it
might require to reconfigure, adapt, and extend some parts of the ontology matching systems,
such a system seems to offer a good starting point which can be turned with a reasonable amount
of work into a good process matching tool. We have to emphasize that only three participants
decided to apply their systems to the new track of process model matching. Thus, we have to be
cautious to generalize the results we observed so far.</p>
      <p>To allow for an in-depth analysis of the performance of the matching systems, we make use
of a new evaluation method which automatically classifies the matching task into matching
patterns with different attributes. The matching patterns are assigned automatically to the reference
alignment, as well as to the matcher output of the three participating matchers. Then
category-dependent precision, recall and f-measure are computed for each category separately. For more
details please refer to [30].</p>
      <p>Tables 25 and 26 show the results of the matching systems for each of the categories. The
second column, the f-measure (FM) over all matching patterns, is given as the micro value, i.e. it
Approach FM
AML
I-Match
LogMap
.702
.472
.481
cP
.890 .942 .915
.907 .942 .924
.894 .981 .935
cP
.953 .603 .739
– – –
– – –
cP
.833 .185 .303
.400 .074 .125
.500 .148 .229
cP
.667 .353 .462
– – –
.133 .353 .194
cP
.167 .529 .254
.500 .059 .105
.089 .529 .153
is computed over all test cases. The remaining columns provide the category-dependent precision
(cP), recall (cR) and f-measure (cFM) for each matcher in each category. cP, cR and cFM are
macro values, independently computed for each category. Moreover, for each category, the tables
contain in the heading the fraction of correspondences from the whole data set as well as the total
number of correspondences of a category in the reference alignment. Cat. I contains alignments
which have no word in common (syntactically). It can be observed that for the University
Admission dataset it is sufficient to identify mainly trivial correspondences. I-Match and LogMap do
not compute any alignments of the most complex category (“Cat. I”). However, AML has a very
high performance for “Cat. I”. In the Birth Registration dataset the fraction of trivial alignments
is very low. The most dominant category is “Cat. I”. Therefore, it is not sufficient to focus on the
identification of trivial alignments. In contrast to the University Admission dataset, the matchers
compute reasonable alignments from “Cat. I” in the Birth Registration dataset. The low
performance of the three matchers for “Cat. trivial” in the Birth Registration dataset indicates mistakes
in the binary gold standard.
In 2016 we organized the Process Model Matching track for the first time. Our evaluation effort
was motivated by the idea that Ontology Matching methods and techniques can also be used in
the related field of Process Model Matching. For that reason we converted one (and in 2017 two)
of the most prominent Process Model matching test datasets into an ontological representation.
The resulting matching problems are instance matching tasks.</p>
      <p>While we were aware that an instance matching system will not be able to exploit the
sequential aspects of the given process models out of the box, we expected lexical components to
generate results that are already on an acceptable level. Even though some of the systems
generated very good results, overall only a few of the systems participating at the OAEI were capable
of generating any results for our test cases. We still do not fully understand the reasons for this
outcome.</p>
      <p>In order to facilitate the evaluation process for participants which cannot evaluate their
matchers with SEALS, we developed a web-based evaluation platform21 to potentially increase the
number of participants. This platform was intended to be used by potential participants from the
process matching community that are not interested in an OAEI participation, which is tailored
for ontology matching systems. Within this platform, participants are able to select one or
multiple gold standards for one of the datasets and subsequently upload their corresponding matcher
results. Afterwards, the participants are able to select from a variety of different metrics
including not only different types of precision, recall and f-measure but also general statistics for the
generated output. Unfortunately, no further matching systems participated via the platform.</p>
      <p>The participation rate indicates that only a limited number of participants is interested in
process model matching. For that reason we will not offer a third edition of this track in 2018.
12</p>
    </sec>
    <sec id="sec-10">
      <title>Statistical analysis</title>
      <p>The traditional evaluation carried out in the OAEI tracks consists simply of comparing and
ranking systems based on performance scores such as F-measure. In the case of tracks with multiple
datasets, performance scores are averaged for all datasets, and the systems are compared
accordingly. While performance scores enable us to gauge the performance of matching systems
individually, they are insufficient for drawing statistically meaningful comparisons between
systems.</p>
      <p>In the interest of providing a more in-depth comparison of the matching systems that
participated in this year’s competition, this section presents an analysis based on statistical inference.
For one-dataset comparisons, we use McNemar’s test. This test takes as input the alignments
produced by two matching systems plus the reference alignment, and produces as output an indicator
which shows if either system is better than the other or whether they are approximately the same.
This method of comparison does not need a particular performance score to be determined
beforehand. Further, the comparison is not solely based on the juxtaposition of two scalars, but rather, it
is substantiated by the statistical evidence (null hypothesis testing). Two variants of McNemar’s
test were considered: one where false correspondences were ignored so that the comparison was
predicated only on the correct correspondences found by matching systems; and another where
both correct and false correspondences were considered, meaning that systems were compared
based on the full alignment they generated. A directed graph can be used to visualize the
outcome of the test. Interested readers are referred to [34] for more details about the utilization of
this methodology.</p>
      <p>For comparisons over multiple datasets, we used the Friedman test with the corresponding
post-hoc procedure for comparison. This test requires the specification of one performance score.
The outcome of the test can be visualized by critical difference (CD) diagrams.</p>
      <p>Since the comparisons between matching systems are done pairwise, it is necessary to correct
the statistics for multiple testing. We used the Bergmann correction method to control the
familywise error rate in all tests.
21 http://alkmaar.informatik.uni-mannheim.de/pmmc
Anatomy track In this year’s competition, 11 systems participate in the anatomy track.
However, the alignments of the LogMap family could not be parsed by the Alignment API, so we had
to leave them out from the comparative analysis for this track.</p>
      <p>Figure 10 shows the directed graph with the outcome of McNemar’s test over participatory
systems when the false correspondences are not taken into account. Figure 11 shows the
corresponding result when all correspondences are considered. The nodes in these graphs are the
systems and a directed edge A → B indicates the superiority of A over B. If there is no such
edge between any two systems, then they are claimed to be more or less equivalent.</p>
      <p>According to these figures, AML is the best system and WikiV3 and ALIN are the bottom
ones, from both perspectives. There are two differences between the two approaches to
conducting the test. SANOM outperforms KEPLER when the false correspondences are not considered,
and KEPLER is better than SANOM if wrong correspondences are taken into account. It means
that SANOM discovers more correct correspondences than KEPLER, but also more false
correspondences. A similar pattern holds for the comparison of POMAP and YAM-BIO. Interestingly,
no systems are declared to be equivalent, so the outcome of McNemar’s test is similar to a ranking
scheme.</p>
      <p>POMAP</p>
      <p>AML</p>
      <p>XMap
KEPLER</p>
      <p>YAM-BIO</p>
      <p>SANOM</p>
      <p>WikiV3</p>
      <p>ALIN
Fig. 10. Comparison of alignment systems participated in OAEI 2017 on the anatomy track while
the false correspondences are not considered.</p>
      <p>Conference track This track consists of 21 small matching tasks between 7 different
ontologies. Three different types of matching are considered: (i) M1: only matching the classes; (ii)
M2: only matching the properties; (iii) M3: matching both classes and properties. The reference
alignment has also three different variants. Hence, there are nine different modes of evaluating
systems, based on the type of matching and the type of reference alignment. The Friedman test
YAM-BIO</p>
      <p>XMap
Fig. 11. Comparison of alignment systems participated in OAEI 2017 on the anatomy track while
the false correspondences are taken into account.
was applied considering the F-measure of the systems on each of the 21 tasks for each of the
evaluation modes.</p>
      <p>Figure 12 shows the CD diagram of the systems that participated in this track. In this figure,
the x axis is the average rank obtained by the Friedman test, and the systems with the same
performance are connected to each other by the red lines. The lower the average rank in the CD
plot, the better the performance of the system.</p>
      <p>The CD diagram for this track provides little information and insight about the difference
between systems, likely due to the small sample size for the comparison (systems produce only
between 90 and 240 correspondences in total in this track). What is readily seen from this plot is
the superiority of AML, LogMap, and XMap and the poor performance of ALIN, SANOM, and
POMap.</p>
      <p>Fig. 12. Comparison of alignment systems participated in OAEI 2017 on the Conference track.
The x-axis is the average rank of each system obtained by the Friedman test. Systems which are
not significantly different from each other are connected by the red lines.</p>
      <p>LargeBio track This track consists of six matching tasks of large size. The Friedman test was
applied to the F-measure obtained by each system over each alignment task. Figure 13 shows
the corresponding CD diagram for this track. According to this plot, the group containing AML,
XMap, YAM-BIO, LogMap, and LogMapBio are the best systems, and POMAP, SANOM, and
KEPLER are the systems with lackluster performance in this track.</p>
      <p>Fig. 13. Comparison of alignment systems participated in OAEI 2017 on the LargeBio track. The
x-axis is the average rank of each system obtained by the Friedman test. Systems which are not
significantly different from each other are connected by the red lines.</p>
      <p>Multifarm track This track involves 55 matching tasks with ontologies from different
languages. The Friedman test was applied to the F-measure obtained by each system over each task.
The CD diagram depicting the outcome of the test is shown in Figure 14.</p>
      <p>According to this graph, AML is exclusively the best alignment system in this track. LogMap,
CroLOM, and KEPLER perform equally better than the remaining systems. At the other extreme,
LogMapLite, XMap, and SANOM show a poor performance in this track, while WikiV3 ranks
in between the two trios.</p>
      <p>Fig. 14. Comparison of alignment systems participated in OAEI 2017 on the Multifarm track.
The x-axis is the average rank of each system obtained by the Friedman test. Systems which are
not significantly different from each other are connected by the red lines.
13</p>
    </sec>
    <sec id="sec-11">
      <title>Lesson learned and suggestions</title>
      <p>The lessons learned from running OAEI 2017 were the following:
A) Like last year, this year we requested tool registration in June and preliminary submission of
wrapped systems by the end of July, but were more strict in its enforcement. As a result, we
recorded the smallest number of errors and incompatibilities with the SEALS client during
the evaluation phase in recent OAEI editions.</p>
      <p>B) As has been the trend, some system developers struggled to get their systems working with
the SEALS client, mostly due to incompatible versions of libraries. While participation on
the new HOBBIT track was relatively low due to the novelty of the HOBBIT platform and
the short deadline for systems to adapt to it, the solution of using Docker containers to wrap
systems seems promising, and we are already looking into phasing out the SEALS client in
favour of the HOBBIT platform.</p>
      <p>C) While the number of participants this year was similar to that of recent years, their
distribution through the tracks was uneven. The expressive ontologies tracks had no shortage of
participants, and still a fair number participated in the more specialized multifarm track.
However, participation in the interactive matching track and in the three instance
matching tracks (process model, instance, and hobbit) was underwhelming. The latter is puzzling
considering the prize sponsored by IBM Research for the system with the best performance
across the instance matching tracks. Granted, the division of instance matching tracks
between the SEALS client and the HOBBIT platform did not help their cause, as of the 7
total systems that participated in instance matching tasks, only 2 made both a SEALS and
a HOBBIT submission. Nevertheless, the division between “traditional” ontology matching
and instance matching is readily apparent, as only 2 systems have participated in both track
families.</p>
      <p>D) In previous years we identified the need for considering non-binary forms of evaluation,
namely in cases where there is uncertainty about some of the reference mappings. A first
non-binary evaluation type was implemented in the Conference track in 2015, followed by
Disease and Phenotype, and Process Model in 2016. This year, we have introduced statistical
tests to compare matching systems, an analysis that was carried out on the results of 4 tracks.
This approach provides more insights into the comparative performance of systems as well
as more statistical rigour, and thus we hope that it can be expanded and fully integrated into
the OAEI tracks in future editions.</p>
      <p>The lessons learned in the various OAEI 2017 tracks were the following:
conference: Since there has been no improvement in matchers’ performance this year from the
perspective of performed evaluation modalities we will consider adding or replacing existing
evaluation modalities for future editions of OAEI to help disclose further matchers’
characteristics.
largebio: While the current reference alignments, with incoherence-causing mappings flagged
as uncertain, make the evaluation fair to all systems, they are only a compromise solution,
not an ideal one. Thus, we should aim for manually repairing and validating the reference
alignments for future editions.
phenotype: This track attracted a similar level of participation this year compared to last, despite
no cash prize, which demonstrates its intrinsic value and interest among the community of
ontology matching algorithm developers.
interactive: This track’s participation has remained low, as most systems participating in OAEI
opt to focus exclusively on fully automatic matching. We hope to draw more participants to
this track in the future and will continue to expand it so as to better approximate real user
interactions.
process model: The results of the Process Model track have shown that the participating
ontology matching systems are capable of generating good results for the specific problem of
process model matching, even though few were able to exploit the sequential aspects of the
process models. Even though we offered an alternative evaluation process for participants
which cannot evaluate their matchers with SEALS, this alternative failed to attract further
participants. The low participation rate in this track indicates that only a limited number of
participants is interested in process model matching. For that reason we will not offer a third
edition of this track in 2018.
instance: In order to attract more instance matching systems to participate in value semantics
(val-sem), value structure (val-struct), and value structure semantics (val-struct-sem) tasks,
we need to produce benchmarks that have fewer instances (in the order of 10000), of the
same type (in our benchmark we asked systems to compare instances of different types).
To balance those aspects, we must then produce benchmarks that contain more complex
transformations.
14
The OAEI 2017 saw the same number of participants as in recent years, with a healthy mix of new
and returning systems. While last year we posited that new participants were drawn by the allure
of prize money in the new Disease and Phenotype track, the evidence this year seems to contradict
it. On the one hand, participation in Disease and Phenotype remained high this year despite no prize
money. On the other hand, the prize money on offer for performance in instance matching did
not attract many participants to those tracks. Nevertheless, the fact that there continues to be
corporate interest in ontology matching to the point of offering prize money bodes well for the
future of the OAEI.</p>
      <p>Like last year, judging from the repeated tracks, there has been no substantial progress to the
state of the art in ontology matching overall this year:
– There was no noticeable improvement with regard to system run times.
– There were few improvements with regard to F-measure, with the top results in most tracks
remaining the same.
– There was no significant progress with regard to the ability of matching systems to handle
large ontologies and datasets, either in traditional ontology matching or in instance matching.
– There was no progress with regard to alignment repair systems, with only a few returning
systems employing them.</p>
      <p>This conclusion may be due to a plateau being reached by matching systems in some tracks, so that
investing in improving results further would bring diminishing returns. However, it is also the
case that long-term participants tend to focus more on the new datasets and tracks on offer than
on improving in repeated tracks. Given the variety of tracks on offer, it is difficult for system
developers to aim at improving across all tracks each year.</p>
      <p>Most of the participants have provided a description of their systems and their experience in
the evaluation. These OAEI papers, like the present one, have not been peer reviewed. However,
they are full contributions to this evaluation exercise and reflect the hard work and clever insight
people put into the development of participating systems. Reading the papers of the participants
should help people involved in ontology matching find out what makes these algorithms work
and what could be improved.</p>
      <p>The Ontology Alignment Evaluation Initiative will strive to remain a reference to the
ontology matching community by improving both the test cases and the testing methodology to better
reflect actual needs, as well as to promote progress in this field [43]. More information can be
found at:</p>
    </sec>
    <sec id="sec-12">
      <title>Acknowledgements</title>
      <p>We warmly thank the participants of this campaign. We know that they have worked hard to have
their matching tools executable in time and they provided useful reports on their experience. The
best way to learn about the results remains to read the papers that follow.</p>
      <p>We would also like to thank IBM Research for sponsoring the instance matching tracks by
offering prize money for the best performing systems.</p>
      <p>We are grateful to the Universidad Politécnica de Madrid (UPM), especially to Nandana
Mihindukulasooriya and Asunción Gómez Pérez, for moving, setting up and providing the necessary
infrastructure to run the SEALS repositories.</p>
      <p>We are also grateful to Martin Ringwald and Terry Hayamizu for providing the reference
alignment for the anatomy ontologies and thank Elena Beisswanger for her thorough support on
improving the quality of the data set.</p>
      <p>We thank Khiat Abderrahmane for his support in the Arabic data set and Catherine Comparot
for her feedback and support in the MultiFarm test case.</p>
      <p>We also thank for their support the other members of the Ontology Alignment Evaluation
Initiative steering committee: Yannis Kalfoglou (Ricoh laboratories, UK), Miklos Nagy (The Open
University, UK), Natasha Noy (Google Inc., USA), Yuzhong Qu (Southeast University, CN),
York Sure (Leibniz Gemeinschaft, DE), Jie Tang (Tsinghua University, CN), Heiner
Stuckenschmidt (Mannheim Universität, DE), George Vouros (University of the Aegean, GR).</p>
      <p>Michelle Cheatham has been supported by the National Science Foundation award
ICER1440202 “EarthCube Building Blocks: Collaborative Proposal: GeoLink”.</p>
      <p>Jérôme Euzenat, Ernesto Jimenez-Ruiz, Christian Meilicke, Heiner Stuckenschmidt and
Cássia Trojahn dos Santos have been partially supported by the SEALS (IST-2009-238975)
European project in previous years.</p>
      <p>Daniel Faria was supported by the ELIXIR-EXCELERATE project (INFRADEV-3-2015).</p>
      <p>Ernesto Jimenez-Ruiz has also been partially supported by the BIGMED project (IKT
259055), the HealthInsight project (IKT 247784), and the SIRIUS Centre for Scalable Data Access
(Research Council of Norway, project no.: 237889).</p>
      <p>Catia Pesquita was supported by the FCT through the LASIGE Strategic Project
(UID/CEC/00408/2013) and the research grant PTDC/EEI-ESS/4633/2014.
4. Gonc¸alo Antunes, Marzieh Bakhshandeh, Jose´ Borbinha, Joa˜o Cardoso, Sharam
Dadashnia, Chiara Di Francescomarino, Mauro Dragoni, Peter Fettke, Avigdor Gal, Chiara
Ghidini, Philip Hake, Abderrahmane Khiat, Christopher Klinkmu¨ller, Elena Kuss, Henrik
Leopold, Peter Loos, Christian Meilicke, Tim Niesen, Catia Pesquita, Timo Pe´us, Andreas
Schoknecht, Eitam Sheetrit, Andreas Sonntag, Heiner Stuckenschmidt, Tom Thaler, Ingo
Weber, and Matthias Weidlich. The process model matching contest 2015. In 6th EMISA
Workshop, pages 127–155, 2015.
5. Benjamin Ashpole, Marc Ehrig, Jérôme Euzenat, and Heiner Stuckenschmidt, editors. Proc.</p>
      <p>
        K-Cap Workshop on Integrating Ontologies, Banff (Canada), 2005.
6. Olivier Bodenreider. The unified medical language system (UMLS): integrating biomedical
terminology. Nucleic Acids Research, 32:267–270, 2004.
7. Caterina Caracciolo, Je´roˆme Euzenat, Laura Hollink, Ryutaro Ichise, Antoine Isaac,
Ve´ronique Malaise´, Christian Meilicke, Juan Pane, Pavel Shvaiko, Heiner Stuckenschmidt,
Ondrej Sva´b-Zamazal, and Vojtech Sva´tek. Results of the ontology alignment evaluation
initiative 2008. In Proc. 3rd ISWC ontology matching workshop (OM), Karlsruhe (DE), pages
73–120, 2008.
8. Michelle Cheatham, Zlatan Dragisic, Je´roˆme Euzenat, Daniel Faria, Alfio Ferrara, Giorgos
Flouris, Irini Fundulaki, Roger Granada, Valentina Ivanova, Ernesto Jime´nez-Ruiz, et al.
Results of the ontology alignment evaluation initiative 2015. In Proc. 10th ISWC ontology
matching workshop (OM), Bethlehem (PA, US), pages 60–115, 2015.
9. Bernardo Cuenca Grau, Zlatan Dragisic, Kai Eckert, Je´roˆme Euzenat, Alfio Ferrara, Roger
Granada, Valentina Ivanova, Ernesto Jime´nez-Ruiz, Andreas Oskar Kempf, Patrick
Lambrix, Andriy Nikolov, Heiko Paulheim, Dominique Ritze, Franc¸ois Scharffe, Pavel Shvaiko,
Ca´ssia Trojahn dos Santos, and Ondrej Zamazal. Results of the ontology alignment
evaluation initiative 2013. In Pavel Shvaiko, Je´roˆme Euzenat, Kavitha Srinivas, Ming Mao, and
Ernesto Jime´nez-Ruiz, editors, Proc. 8th ISWC ontology matching workshop (OM), Sydney
(NSW, AU), pages 61–100, 2013.
10. Jim Dabrowski and Ethan V. Munson. 40 years of searching for the best computer system
response time. Interacting with Computers, 23(5):555–564, 2011.
11. Je´roˆme David, Je´roˆme Euzenat, Franc¸ois Scharffe, and Ca´ssia Trojahn dos Santos. The
alignment API 4.0. Semantic web journal, 2(1):3–10, 2011.
12. Ca´ssia Trojahn dos Santos, Bo Fu, Ondrej Zamazal, and Dominique Ritze. State-of-the-art
in multilingual and cross-lingual ontology matching. In Towards the Multilingual Semantic
Web, Principles, Methods and Applications, pages 119–135. 2014.
13. Zlatan Dragisic, Kai Eckert, Je´roˆme Euzenat, Daniel Faria, Alfio Ferrara, Roger Granada,
Valentina Ivanova, Ernesto Jime´nez-Ruiz, Andreas Oskar Kempf, Patrick Lambrix,
Stefano Montanelli, Heiko Paulheim, Dominique Ritze, Pavel Shvaiko, Alessandro Solimando,
Ca´ssia Trojahn dos Santos, Ondrej Zamazal, and Bernardo Cuenca Grau. Results of the
ontology alignment evaluation initiative 2014. In Proc. 9th ISWC ontology matching workshop
(OM), Riva del Garda (IT), pages 61–104, 2014.
14. Zlatan Dragisic, Valentina Ivanova, Patrick Lambrix, Daniel Faria, Ernesto Jime´nez-Ruiz,
and Catia Pesquita. User validation in ontology alignment. In The Semantic Web - ISWC
2016 - 15th International Semantic Web Conference, Kobe, Japan, October 17-21, 2016,
Proceedings, Part I, pages 200–217, 2016.
15. Zlatan Dragisic, Valentina Ivanova, Huanyu Li, and Patrick Lambrix. Experiences from
the anatomy track in the ontology alignment evaluation initiative. Journal of Biomedical
Semantics, 2017.
16. Je´roˆme Euzenat, Alfio Ferrara, Laura Hollink, Antoine Isaac, Cliff Joslyn, Ve´ronique
Malaise´, Christian Meilicke, Andriy Nikolov, Juan Pane, Marta Sabou, Franc¸ois Scharffe,
Pavel Shvaiko, Vassilis Spiliopoulos, Heiner Stuckenschmidt, Ondrej Sva´b-Zamazal,
Vojtech Sva´tek, Ca´ssia Trojahn dos Santos, George Vouros, and Shenghui Wang. Results of
the ontology alignment evaluation initiative 2009. In Proc. 4th ISWC ontology matching
workshop (OM), Chantilly (VA, US), pages 73–126, 2009.
17. Je´roˆme Euzenat, Alfio Ferrara, Christian Meilicke, Andriy Nikolov, Juan Pane, Franc¸ois
Scharffe, Pavel Shvaiko, Heiner Stuckenschmidt, Ondrej Sva´b-Zamazal, Vojtech Sva´tek, and
Ca´ssia Trojahn dos Santos. Results of the ontology alignment evaluation initiative 2010. In
Proc. 5th ISWC ontology matching workshop (OM), Shanghai (CN), pages 85–117, 2010.
18. Jérôme Euzenat, Alfio Ferrara, Willem Robert van Hage, Laura Hollink, Christian
Meilicke, Andriy Nikolov, Franc¸ois Scharffe, Pavel Shvaiko, Heiner Stuckenschmidt, Ondrej
Sva´b-Zamazal, and Ca´ssia Trojahn dos Santos. Results of the ontology alignment
evaluation initiative 2011. In Proc. 6th ISWC ontology matching workshop (OM), Bonn (DE),
pages 85–110, 2011.
19. Je´roˆme Euzenat, Antoine Isaac, Christian Meilicke, Pavel Shvaiko, Heiner Stuckenschmidt,
Ondrej Svab, Vojtech Svatek, Willem Robert van Hage, and Mikalai Yatskevich. Results of
the ontology alignment evaluation initiative 2007. In Proc. 2nd ISWC ontology matching
workshop (OM), Busan (KR), pages 96–132, 2007.
20. Je´roˆme Euzenat, Christian Meilicke, Pavel Shvaiko, Heiner Stuckenschmidt, and Ca´ssia
Trojahn dos Santos. Ontology alignment evaluation initiative: six years of experience. Journal
on Data Semantics, XV:158–192, 2011.
21. Je´roˆme Euzenat, Malgorzata Mochol, Pavel Shvaiko, Heiner Stuckenschmidt, Ondrej Svab,
Vojtech Svatek, Willem Robert van Hage, and Mikalai Yatskevich. Results of the ontology
alignment evaluation initiative 2006. In Proc. 1st ISWC ontology matching workshop (OM),
Athens (GA, US), pages 73–95, 2006.
22. Je´roˆme Euzenat and Pavel Shvaiko. Ontology matching. Springer-Verlag, Heidelberg (DE),
2nd edition, 2013.
23. Daniel Faria, Ernesto Jime´nez-Ruiz, Catia Pesquita, Emanuel Santos, and Francisco M.
      </p>
      <p>Couto. Towards Annotating Potential Incoherences in BioPortal Mappings. In 13th
International Semantic Web Conference, volume 8797 of Lecture Notes in Computer Science,
pages 17–32. Springer, 2014.
24. Ian Harrow, Ernesto Jime´nez-Ruiz, Andrea Splendiani, Martin Romacker, Peter Woollard,
Scott Markel, Yasmin Alam-Faruque, Martin Koch, James Malone, and Arild Waaler.
Matching Disease and Phenotype Ontologies in the Ontology Alignment Evaluation Initiative.</p>
      <p>Journal of Biomedical Semantics, 2018.
25. Ernesto Jime´nez-Ruiz and Bernardo Cuenca Grau. LogMap: Logic-based and scalable
ontology matching. In Proc. 10th International Semantic Web Conference (ISWC), Bonn (DE),
pages 273–288, 2011.
26. Ernesto Jiménez-Ruiz, Bernardo Cuenca Grau, Ian Horrocks, and Rafael Berlanga.
Logic-based assessment of the compatibility of UMLS ontology sources. J. Biomed. Sem., 2, 2011.
27. Ernesto Jime´nez-Ruiz, Christian Meilicke, Bernardo Cuenca Grau, and Ian Horrocks.
Evaluating mapping repair systems with large biomedical ontologies. In Proc. 26th Description
Logics Workshop, 2013.
28. Yevgeny Kazakov, Markus Kro¨tzsch, and Frantisek Simancik. Concurrent classification of
EL ontologies. In Proc. 10th International Semantic Web Conference (ISWC), Bonn (DE),
pages 305–320, 2011.
29. Elena Kuss, Henrik Leopold, Han Van der Aa, Heiner Stuckenschmidt, and Hajo A Reijers.</p>
      <p>Probabilistic evaluation of process model matching techniques. In Conceptual Modeling:
35th International Conference, ER 2016, Gifu, Japan, November 14-17, 2016, Proceedings
35, pages 279–292. Springer, 2016.
30. Elena Kuss and Heiner Stuckenschmidt. Automatic classification to matching patterns for
process model matching evaluation. In Proceedings of the ER Forum 2017 and the ER 2017
Demo Track co-located with the 36th International Conference on Conceptual Modelling
(ER 2017), Valencia, Spain, November 6-9, 2017, pages 292–305, 2017.
31. Pasquale Lisena, Manel Achichi, Eva Ferna´ndez, Konstantin Todorov, and Raphae¨l Troncy.</p>
      <p>Exploring linked classical music catalogs with overture. In ISWC PD: International
Semantic Web Conference Posters and Demos, 2016.
32. Christian Meilicke. Alignment Incoherence in Ontology Matching. PhD thesis, University</p>
      <p>
        Mannheim, 2011.
33. Christian Meilicke, Rau´l Garc´ıa Castro, Frederico Freitas, Willem Robert van Hage, Elena
Montiel-Ponsoda, Ryan Ribeiro de Azevedo, Heiner Stuckenschmidt, Ondrej Sva´b-Zamazal,
Vojtech Sva´tek, Andrei Tamilin, Ca´ssia Trojahn, and Shenghui Wang. MultiFarm: A
benchmark for multilingual ontology matching. Journal of web semantics, 15(3):62–68, 2012.
34. Majid Mohammadi, Amir Ahooye Atashin, Wout Hofman, and Yaohua Tan. Comparison
of ontology alignment algorithms across single matching task via the McNemar test. arXiv,
arXiv:1704.00045.
35. Boris Motik, Rob Shearer, and Ian Horrocks. Hypertableau reasoning for description logics.
      </p>
      <p>
        Journal of Artificial Intelligence Research, 36:165–228, 2009.
36. Heiko Paulheim, Sven Hertling, and Dominique Ritze. Towards evaluating interactive
ontology matching tools. In Proc. 10th Extended Semantic Web Conference (ESWC), Montpellier
(FR), pages 31–45, 2013.
37. Catia Pesquita, Daniel Faria, Emanuel Santos, and Francisco Couto. To repair or not to
repair: reconciling correctness and coherence in ontology reference alignments. In Proc. 8th
ISWC ontology matching workshop (OM), Sydney (AU), page this volume, 2013.
38. Manuel Salvadores, Paul R. Alexander, Mark A. Musen, and Natalya Fridman Noy.
BioPortal as a dataset of linked biomedical ontologies and terminologies in RDF. Semantic Web,
4(3):277–284, 2013.
39. Emanuel Santos, Daniel Faria, Catia Pesquita, and Francisco Couto. Ontology alignment
repair through modularization and confidence-based heuristics. CoRR, abs/1307.5322, 2013.
40. T. Saveta, E. Daskalaki, G. Flouris, I. Fundulaki, M. Herschel, and A.-C. Ngonga Ngomo.
      </p>
      <p>Pushing the limits of instance matching systems: A semantics-aware benchmark for linked
data. In WWW, Companion Volume, 2015.
41. Tzanina Saveta, Evangelia Daskalaki, Giorgos Flouris, Irini Fundulaki, Melanie Herschel,
and Axel-Cyrille Ngonga Ngomo. Lance: Piercing to the heart of instance matching tools.</p>
      <p>In International Semantic Web Conference, pages 375–391. Springer, 2015.
42. Mohamed Ahmed Sherif, Kevin Dreßler, Panayiotis Smeros, and Axel-Cyrille Ngonga
Ngomo. RADON - Rapid Discovery of Topological Relations. In Proceedings of The
Thirty-First AAAI Conference on Artificial Intelligence (AAAI-17), 2017.
43. Pavel Shvaiko and Je´roˆme Euzenat. Ontology matching: state of the art and future challenges.</p>
      <p>
        IEEE Transactions on Knowledge and Data Engineering, 25(1):158–176, 2013.
44. Alessandro Solimando, Ernesto Jime´nez-Ruiz, and Giovanna Guerrini. Detecting and
correcting conservativity principle violations in ontology-to-ontology mappings. In The
Semantic Web–ISWC 2014, pages 1–16. Springer, 2014.
45. Alessandro Solimando, Ernesto Jimenez-Ruiz, and Giovanna Guerrini. Minimizing
conservativity violations in ontology alignments: Algorithms and evaluation. Knowledge and
Information Systems, 2016.
46. York Sure, Oscar Corcho, Je´roˆme Euzenat, and Todd Hughes, editors. Proc. ISWC Workshop
on Evaluation of Ontology-based Tools (EON), Hiroshima (JP), 2004.
      </p>
    </sec>
  </body>
  <back>
    <ref-list>
      <ref id="ref1">
        <mixed-citation>
          1.
          <string-name>
            <given-names>Manel</given-names>
            <surname>Achichi</surname>
          </string-name>
          , Rodolphe Bailly, Cécile Cecconi, Marie Destandau, Konstantin Todorov, and Raphaël Troncy.
          <article-title>Doremus: Doing reusable musical data</article-title>
          .
          <source>In ISWC PD: International Semantic Web Conference Posters and Demos</source>
          ,
          <year>2015</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref2">
        <mixed-citation>
          2.
          <string-name>
            <given-names>Manel</given-names>
            <surname>Achichi</surname>
          </string-name>
          , Michelle Cheatham, Zlatan Dragisic, Jérôme Euzenat, Daniel Faria, Alfio Ferrara, Giorgos Flouris, Irini Fundulaki, Ian Harrow, Valentina Ivanova, Ernesto Jiménez-Ruiz,
          <string-name>
            <given-names>Elena</given-names>
            <surname>Kuss</surname>
          </string-name>
          , Patrick Lambrix, Henrik Leopold,
          <string-name>
            <given-names>Huanyu</given-names>
            <surname>Li</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Christian</given-names>
            <surname>Meilicke</surname>
          </string-name>
          , Stefano Montanelli, Catia Pesquita, Tzanina Saveta, Pavel Shvaiko, Andrea Splendiani, Heiner Stuckenschmidt, Konstantin Todorov, Cássia Trojahn, and
          <string-name>
            <given-names>Ondrej</given-names>
            <surname>Zamazal</surname>
          </string-name>
          .
          <article-title>Results of the ontology alignment evaluation initiative 2016</article-title>
          .
          <source>In Proc. 11th ISWC ontology matching workshop (OM)</source>
          ,
          <conf-loc>Kobe (JP)</conf-loc>
          , pages
          <fpage>73</fpage>
          -
          <lpage>129</lpage>
          ,
          <year>2016</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref3">
        <mixed-citation>
          3. José Luis Aguirre, Bernardo Cuenca Grau, Kai Eckert, Jérôme Euzenat, Alfio Ferrara, Willem Robert van Hage,
          <string-name>
            <given-names>Laura</given-names>
            <surname>Hollink</surname>
          </string-name>
          , Ernesto Jiménez-Ruiz,
          <string-name>
            <given-names>Christian</given-names>
            <surname>Meilicke</surname>
          </string-name>
          , Andriy Nikolov, Dominique Ritze, François Scharffe, Pavel Shvaiko, Ondrej Sváb-Zamazal, Cássia Trojahn, and
          <string-name>
            <given-names>Benjamin</given-names>
            <surname>Zapilko</surname>
          </string-name>
          .
          <article-title>Results of the ontology alignment evaluation initiative 2012</article-title>
          .
          <source>In Proc. 7th ISWC ontology matching workshop (OM)</source>
          , Boston (MA, US), pages
          <fpage>73</fpage>
          -
          <lpage>115</lpage>
          ,
          <year>2012</year>
          .
        </mixed-citation>
      </ref>
    </ref-list>
  </back>
</article>