<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Multimedia information extraction from HTML product catalogues</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Martin</forename><surname>Labský</surname></persName>
							<email>labsky@vse.cz</email>
							<affiliation key="aff0">
								<orgName type="department">Department of Information and Knowledge Engineering</orgName>
								<orgName type="institution">University of Economics</orgName>
								<address>
									<addrLine>W. Churchill Sq. 4</addrLine>
									<postCode>130 67</postCode>
									<settlement>Prague, Praha 3</settlement>
									<country key="CZ">Czech Republic</country>
								</address>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="department">Department of Information and Knowledge Engineering</orgName>
								<orgName type="institution">University of Economics</orgName>
								<address>
									<addrLine>W. Churchill Sq. 4</addrLine>
									<postCode>130 67</postCode>
									<settlement>Prague, Praha 3</settlement>
									<country key="CZ">Czech Republic</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Pavel</forename><surname>Praks</surname></persName>
							<email>pavel.praks@vsb.cz</email>
							<affiliation key="aff1">
								<orgName type="department">Department of Applied Mathematics</orgName>
								<orgName type="institution">VŠB - Technical University of Ostrava</orgName>
								<address>
									<addrLine>17. listopadu 15</addrLine>
									<postCode>708 33</postCode>
									<settlement>Ostrava-Poruba</settlement>
									<country key="CZ">Czech Republic</country>
								</address>
							</affiliation>
							<affiliation key="aff3">
								<orgName type="department">Department of Mathematics and Descriptive Geometry</orgName>
							</affiliation>
							<affiliation key="aff4">
								<orgName type="department">Department of Applied Mathematics</orgName>
								<orgName type="institution">Technical University of Ostrava</orgName>
								<address>
									<addrLine>17. listopadu 15</addrLine>
									<postCode>708 33</postCode>
									<settlement>Ostrava-Poruba</settlement>
									<country key="CZ">Czech Republic</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Vojtěch</forename><surname>Svátek</surname></persName>
							<email>svatek@vse.cz</email>
							<affiliation key="aff0">
								<orgName type="department">Department of Information and Knowledge Engineering</orgName>
								<orgName type="institution">University of Economics</orgName>
								<address>
									<addrLine>W. Churchill Sq. 4</addrLine>
									<postCode>130 67</postCode>
									<settlement>Prague, Praha 3</settlement>
									<country key="CZ">Czech Republic</country>
								</address>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="department">Department of Information and Knowledge Engineering</orgName>
								<orgName type="institution">University of Economics</orgName>
								<address>
									<addrLine>W. Churchill Sq. 4</addrLine>
									<postCode>130 67</postCode>
									<settlement>Prague, Praha 3</settlement>
									<country key="CZ">Czech Republic</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Ondřej</forename><surname>Šváb</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Department of Information and Knowledge Engineering</orgName>
								<orgName type="institution">University of Economics</orgName>
								<address>
									<addrLine>W. Churchill Sq. 4</addrLine>
									<postCode>130 67</postCode>
									<settlement>Prague, Praha 3</settlement>
									<country key="CZ">Czech Republic</country>
								</address>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="department">Department of Information and Knowledge Engineering</orgName>
								<orgName type="institution">University of Economics</orgName>
								<address>
									<addrLine>W. Churchill Sq. 4</addrLine>
									<postCode>130 67</postCode>
									<settlement>Prague, Praha 3</settlement>
									<country key="CZ">Czech Republic</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">Multimedia information extraction from HTML product catalogues</title>
					</analytic>
					<monogr>
						<imprint>
							<date/>
						</imprint>
					</monogr>
					<idno type="MD5">A42BA3D478C03A83D2C0AA88FC1A412B</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2023-03-24T06:47+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>We describe a demo application of information extraction from company websites, focusing on bicycle product offers. A statistical approach (Hidden Markov Models) is used in combination with different ways of image classification, including latent semantic analysis of image collections. Ontological knowledge is used to group the extracted items into structured objects. The results are stored in an RDF repository and made available for structured search.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">Introduction</head><p>Tools and techniques for web information extraction (WIE) have recently been recognised as one of key enablers for semantic web (SW) scaling. In our long-term project named Rainbow <ref type="foot" target="#foot_0">3</ref> we address several intertwined topics that we consider important for efficient 'WIE for SW' applications:</p><p>1. Exploitation of multiple information modalities available in web documents 2. Synergy of learning and reuse of ontological information 3. Automated acquisition and labelling of training data for extractor learning 4. Bridging between automated acquisition of SW data and their usage 5. Support for easy design of WIE applications from components.</p><p>In this paper, we focus on an ongoing demo application in the domain of bicycle product offers. Section 2 presents the core method: automated HTML annotation based on Hidden Markov Models. Section 3 extends the analysis of HTML code with that of images. Section 4 describes the composition of product offer instances with the help of a simple ontology. Section 5 outlines the architecture of the demo application and the subsequent usage of extracted data in an RDF repository. Finally, section 6 focuses on future work. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2">Web Page Annotation Using HMMs</head><p>For extracting product entries from web catalogues, we built a Hidden Markov Model (HMM) tagger, which assigns a semantic tag to each token from a document. Tokens are either words, formatting tags or images. In our experiments, we evaluated the HMM performance on a diverse set of web pages, which come from different web sites and have heterogeneous formatting.</p><p>We manually annotated a set of 100 HTML documents chosen from the Google Directory Sports-Cycling-BikeShops-Europe-UK-England. Each document contains from 1 to 50 bicycle offers, and each offer consists of at least the bicycle name and price. There are typically 3-4 documents from the same shop in the data. Annotations for 15 bicycle characteristics were made using SGML tags<ref type="foot" target="#foot_1">4</ref>. A sample of annotated data is shown in Figure <ref type="figure" target="#fig_0">1</ref>.</p><p>To represent web documents, we employed extensive pre-processing. Similarly to <ref type="bibr" target="#b6">[7]</ref>, we transform each document into XHTML and perform canonicalisation of XML entities<ref type="foot" target="#foot_2">5</ref>. Certain HTML tags and tag groups are replaced by their generalisations<ref type="foot" target="#foot_3">6</ref>. Since only words and images can be extracted, we dispose of mark-up blocks that do not directly contain words or images.</p><p>HMMs are probabilistic finite state machines, which represent text as a sequence of tokens. An HMM consists of states which generate tokens, and of Fig. <ref type="figure">2</ref>. HMM architecture transitions between these states. States are associated with token generation probabilities, and transitions with transition probabilities. Both kinds of probabilities are estimated from training data. For the purposes of information extraction, states are typically associated with semantic tags to be extracted. 
To annotate a document using a trained HMM, that document is assumed to have been generated by that HMM. The most probable state sequence is then found using the Viterbi algorithm <ref type="bibr" target="#b11">[12]</ref>.</p><p>The structure of our HMM is inspired by <ref type="bibr" target="#b5">[6]</ref> and is sketched in Figure <ref type="figure">2</ref>. Extracted slots are modelled using target states (denoted as T). Each target state is accompanied by two types of helper states responsible for representing the slot's characteristic context -the prefix and suffix states (P and S). Irrelevant tokens are modelled by a single background state (B). Contrary to <ref type="bibr" target="#b5">[6]</ref> and <ref type="bibr" target="#b16">[17]</ref>, which use independent HMMs trained for each slot separately, we train a single composite HMM capable of extracting all slots at once. Our model thus contains multiple target, prefix and suffix states. This approach, also used in <ref type="bibr" target="#b0">[1]</ref>, captures the ordering relations between nearby slots (e.g. product image often follows its name). We experimented also with other HMM architectures, with results presented in <ref type="bibr" target="#b15">[16]</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3">Impact of Image Classification</head><p>For the purpose of extracting product images, we examined the impact of image information available to the HMM tagger. As a baseline approach, we measured the tagging performance when no image information was available for tagging. In this case, all images were represented by the same token and product pictures could only be distinguished based on the context in which they appeared.</p><p>In order to provide our tagger with more information, we built image classifiers to determine whether the extracted product is depicted in a particular image. We used the following features for classification: image dimensions, similarity to training product images, and whether there is more than one occurrence of the same image in the containing document.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1">Image dimensions</head><p>For our domain, we modelled images of bicycles using a 2-dimensional normal distribution, only estimated from positive training examples<ref type="foot" target="#foot_4">7</ref>. The dimensions x, y of a new image I are first evaluated using the estimated normal density N. The density value is then normalized to the interval (0,1) using the density's maximum value N_{max}.</p><formula xml:id="formula_0">Dim(I) := \frac{N(x, y)}{N_{max}} <label>(1)</label></formula><p>An image I is then classified as Pos or Neg by comparing its Dim(I) score to a threshold T_{Dim}. This threshold was estimated by minimizing the classification error rate on a separate heldout set of 150 images.</p><formula xml:id="formula_1">class(I) = \begin{cases} Pos &amp; \text{if } Dim(I) \geq T_{Dim}, \\ Neg &amp; \text{otherwise.} \end{cases} <label>(2)</label></formula><p>Within our document collection, image dimensions appeared to be the best single predictor with the error rate of 6.6%. However, this is mainly due to our collection being limited to relevant product catalogues only. When dealing with more heterogeneous data, features describing the actual image content will become necessary.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2">Image similarity</head><p>We experimented with a latent semantic approach to measuring image similarity, described in <ref type="bibr" target="#b9">[10]</ref> and <ref type="bibr" target="#b10">[11]</ref>. This kind of image similarity has been applied to image retrieval from collections, where the task often is to find the most similar image to a query. We used this image-to-image similarity measure sim(I, J) to compute sim_C(I), the similarity of an image I to a collection of images C. In our experiments, C contained the training bicycle pictures (positive examples only).</p><p>To compute sim_C(I), we used the K nearest neighbor approach and averaged the similarities of the K most similar images from the collection.</p><formula xml:id="formula_3">sim_C(I) = \frac{\sum_{K \text{ best images } J \in C} sim(I, J)}{K} <label>(3)</label></formula><p>Experimentally, we set K = 20, since lower values of K lead to a decrease in the similarity's robustness<ref type="foot" target="#foot_5">8</ref> and higher values did not bring further improvement. To build a classifier, a similarity threshold T_{Sim} was estimated on a heldout set in the same way as for the dimension classifier above. The error rate of the classifier was 26.7% on our document collection.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3">Combined classifier</head><p>For the combined image classifier, we used the above-described dimension score Dim(I), similarity score Sim(I) and a binary feature indicating whether the image occurs more than once in the document. We experimented with different classifiers available in the Weka<ref type="foot" target="#foot_6">9</ref> environment, and the best error rate<ref type="foot" target="#foot_7">10</ref> of 4.8% was achieved by the multilayer perceptron algorithm.</p><p>Results for all three classifiers are compared in Table <ref type="table" target="#tab_0">1</ref>. All results were measured using 10-fold cross-validation on a set of 1,507 occurrences of 999 unique images taken from our training documents. The first two algorithms used an additional 150 heldout images to estimate their decision thresholds. The cross-validation splitting was done at the level of documents, so that all images from a single document were either used for training or for testing. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.4">Using Image Information for Extraction</head><p>To improve extraction results, we need to communicate the image classifier's results to the HMM tagger. Currently we do this simply by substituting each image occurrence in a document by its class. Since these binary decisions would leave little room for the HMM tagger to fix incorrect classifications, we adapted the above binary classifiers to classify into 3 classes: Pos, Neg, and Unk. In this way, the HMM tagger learns to classify the Pos and Neg classes correspondingly, and the tagging of the Unk class depends more strongly on the context. To build the ternary versions of the dimension- and similarity-based classifiers, we introduced costs for the classifier's decisions. Each wrong decision was penalized by C_{Miss} = 1 and the cost of each Unk decision was C_{Unk} ∈ (0, 1). We set C_{Unk} manually such that the classifier produced 5-10% of Unk decisions on the heldout set. While minimizing the sum of these costs on the heldout set, two thresholds were estimated for both the dimension- and similarity-based classifiers, delimiting their Neg, Unk and Pos decisions.</p><p>For the combined ternary classifier, we achieved the best results with a decision list shown in Table <ref type="table" target="#tab_1">2</ref>. The list combines image occurrence count with the results of the dimension- and similarity-based ternary classifiers, denoted as class^3_{Dim} and class^3_{Sim} respectively. We evaluated information extraction results with all three ternary classifiers and compared the results to the case where no image information was available. The new image information from the combined classifier led to an increase of 19.1 percentage points in picture precision and also to subtle improvements for other tags. 
Improvements in precision and recall for 3 chosen slots (product pictures, names and prices), measured on a per-token basis, are shown in Table <ref type="table" target="#tab_2">3</ref> for all three classifiers. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4">Ontology-Based Instance Composition</head><p>Semantic web is not about isolated tagged items but about complex and interrelated entities; we thus need to group the labels produced by automated annotation into instances. We currently use a simple sequential algorithm that exploits constraints defined in a tiny presentation ontology<ref type="foot" target="#foot_8">11</ref> <ref type="bibr" target="#b8">[9]</ref>, which partly pertain to the generic domain (bike offers) and partly to the way of presenting information in web catalogues. Figure <ref type="figure" target="#fig_1">3</ref> shows an experimental presentation ontology containing the class 'Bike offer'. The utilized constraints are uniqueness, multiplicity and optionality of certain properties, the latter two indicated with the * and ? symbols, respectively<ref type="foot" target="#foot_9">12</ref>. In addition, 'sticky' properties (indicated with !) are distinguished: as soon as the value of a sticky property is discovered on a page, it is filled into all objects extracted afterwards, until a new value is discovered for this property.</p><p>An annotated item is added to the currently assembled (bike offer) instance unless it would cause inconsistency; otherwise, the current instance is saved and a new instance created to accommodate this item and the following ones. Despite acceptable performance on error-free, hand-annotated training data, where the algorithm correctly groups about 90% of names and prices, this 'baseline' approach achieves very poor results on automatically annotated data: on average, less than 50% of corresponding annotations are grouped properly, often for trivial reasons. The most critical problems are connected with missing or extra annotations, multiple different references to a single slot, and with transposed HTML tables.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5">Result Transformation, Storage And Retrieval</head><p>All components developed within the Rainbow project are wrapped as web services. The WIE component itself is currently being called by a simple control routine (written in Java), which also optionally calls other analysis tools: in the bicycle application, we so far experimented with URL-based navigation over the website, extraction of the content of selected META tags, and extraction of 'company profile sentences' from free text <ref type="foot" target="#foot_10">13</ref> . The results are transformed to RDF (with respect to a 'bicycle-offer RDFS ontology') and stored in a Sesame <ref type="bibr" target="#b1">[2]</ref> repository. An end-user search interface to this repository<ref type="foot" target="#foot_11">14</ref> is shown in Fig. <ref type="figure" target="#fig_2">4</ref>. It relies on a collection of query templates expressed in SeRQL (the native query language of Sesame) and enables a simple form of navigational retrieval <ref type="bibr" target="#b15">[16]</ref>. Most urgently, we need to replace the 'toy' implementation of ontology-based instance composition with a version reasonably robust on automatically annotated data. For some of the layout-oriented problems mentioned in section 4, partial solutions recently suggested in IE research (e.g. <ref type="bibr" target="#b2">[3,</ref><ref type="bibr" target="#b4">5]</ref>) could be reused. We also consider introducing HMMs even to this phase of extraction; a modified version of Viterbi algorithm supporting domain constraints (such as those in our presentation ontology) has already been described in <ref type="bibr" target="#b0">[1]</ref>. 
Another aspect worth investigation is the possibility of (semi-)automatic construction of presentation ontologies from the corresponding domain ontologies.</p><p>A critical bottleneck of ML-based IE methods (in particular of statistical ones) is the volume of labelled training data required. In our experiments with product catalogues, we noticed that the tagger often classifies most product entries correctly but misses a few product names that are very different from the training data. We developed a simple symbolic algorithm that identifies similar structural patterns in a document. For example, the HTML tag sequence &lt;td&gt; &lt;a&gt; &lt;font&gt; &lt;br/&gt; &lt;/font&gt; &lt;/a&gt; &lt;/td&gt; with arbitrary words in between appears 34 times in one of our training documents: the tagger successfully annotated 28 product names contained in these patterns between &lt;font&gt; and &lt;br/&gt;, but missed the remaining 6. In such cases, we could collect the remaining product names and use them to enrich the model's training data. By learning novel product names from these 'easy' pages, the model will learn to also recognise them in less structured documents 15 . We also plan to bootstrap the method with data picked from public resources related to product offering, following up with our earlier experiments with Open Directory headings and references <ref type="bibr" target="#b7">[8]</ref>.</p><p>Another important task is to replace hard-coded control routines with semiautomatically constructed, implementation-independent application models. 
A knowledge modelling framework has already been introduced for this purpose <ref type="bibr" target="#b13">[14]</ref>; currently we examine the adaptability of a PSM-based semantic web-service configuration technique in connection with this framework <ref type="bibr" target="#b14">[15]</ref>.</p><p>Eventually, we plan to associate our efforts with the popular Armadillo project <ref type="bibr" target="#b2">[3]</ref>, with which we share most of our abovementioned research interests.</p><p>The research is partially supported by grant no.201/03/1318 of the Grant Agency of the Czech Republic, "Intelligent analysis of the WWW content and structure".</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Fig. 1 .</head><label>1</label><figDesc>Fig. 1. Hand-annotated training data</figDesc><graphic coords="2,118.55,53.72,207.26,195.81" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Fig. 3 .</head><label>3</label><figDesc>Fig. 3. Bicycle offer presentation ontology</figDesc><graphic coords="7,136.01,53.72,172.64,140.40" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Fig. 4 .</head><label>4</label><figDesc>Fig. 4. End-user search interface</figDesc><graphic coords="8,83.62,53.75,277.07,216.30" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1 .</head><label>1</label><figDesc>Image classification results</figDesc><table><row><cell></cell><cell cols="3">Dimension Similarity Combined</cell></row><row><cell>Error rate (%)</cell><cell>6.6</cell><cell>26.7</cell><cell>4.8</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>Table 2 .</head><label>2</label><figDesc>Decision list for the combined ternary classifier</figDesc><table><row><cell>Order</cell><cell>Rule</cell></row><row><cell>1</cell><cell>class(I) = Neg if(occurrences(I) &gt; 1)</cell></row><row><cell>2</cell><cell>class(I) = Pos if(class^3_{Dim}(I) = Pos)</cell></row><row><cell>3</cell><cell>class(I) = Unk if(class^3_{Dim}(I) = Unk)</cell></row><row><cell>4</cell><cell>class(I) = Unk if(class^3_{Sim}(I) = Pos)</cell></row><row><cell>5</cell><cell>class(I) = Neg</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head>Table 3 .</head><label>3</label><figDesc>10-fold cross-validation results for selected tags over 100 documents</figDesc><table><row><cell>Tag</cell><cell cols="6">Precision Recall F-measure Precision Recall F-measure</cell></row><row><cell></cell><cell cols="3">No image information</cell><cell cols="3">Image similarity</cell></row><row><cell cols="2">Picture 67.8</cell><cell>87.1</cell><cell>76.2</cell><cell>78.5</cell><cell>87.3</cell><cell>82.7</cell></row><row><cell>Name</cell><cell>83.7</cell><cell>82.5</cell><cell>83.1</cell><cell>83.9</cell><cell>82.5</cell><cell>83.2</cell></row><row><cell>Price</cell><cell>83.7</cell><cell>94.4</cell><cell>88.8</cell><cell>84.0</cell><cell>94.4</cell><cell>88.9</cell></row><row><cell></cell><cell cols="3">Image Dimensions</cell><cell></cell><cell>Combined</cell><cell></cell></row><row><cell cols="2">Picture 85.6</cell><cell>88.4</cell><cell>87.0</cell><cell>86.9</cell><cell>89.1</cell><cell>88.0</cell></row><row><cell>Name</cell><cell>83.8</cell><cell>82.5</cell><cell>83.1</cell><cell>83.8</cell><cell>82.5</cell><cell>83.2</cell></row><row><cell>Price</cell><cell>84.0</cell><cell>94.4</cell><cell>88.9</cell><cell>84.0</cell><cell>94.4</cell><cell>88.9</cell></row></table></figure>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="3" xml:id="foot_0">http://rainbow.vse.cz K. Richta, V. Snášel, J. Pokorný (Eds.): Dateso 2005, pp. 84-93, ISBN 80-01-03204-3.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="4" xml:id="foot_1">The training data and a demo are available at http://rainbow.vse.cz.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="5" xml:id="foot_2">This step unifies different ways of writing the same characters in XML.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="6" xml:id="foot_3">Most tags are only represented using their names, disregarding any attributes. Often-occurring design patterns, such as add-to-basket buttons, are identified using several manually authored rules, and replaced by dedicated tokens.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="7" xml:id="foot_4">The positive examples comprise all bicycle pictures found in the documents, not only those labeled as parts of bicycle offers. For information extraction, this increases the role of image context for correct tagging.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="8" xml:id="foot_5">With low values of K, simC(I) became too sensitive to individual images J with misleading values of sim(I, J).</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="9" xml:id="foot_6">http://www.cs.waikato.ac.nz/~ml</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="10" xml:id="foot_7">This error rate comes from 10-fold cross-validation without using heldout data.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="11" xml:id="foot_8">Similar to 'extraction ontologies' used by Embley<ref type="bibr" target="#b4">[5]</ref>.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="12" xml:id="foot_9">Although not shown in the example, we can also use e.g. property value types or regular expressions.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="13" xml:id="foot_10">These three approaches to website analysis, implemented independent of the bicycle demo application, are evaluated in<ref type="bibr" target="#b12">[13]</ref>.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="14" xml:id="foot_11">Available at http://rainbow.vse.cz:8000/sesame.</note>
		</body>
		<back>
			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Automatic segmentation of text into structured records</title>
		<author>
			<persName><forename type="first">V</forename><surname>Borkar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Deshmukh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Sarawagi</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">SIGMOD Conference</title>
				<imprint>
			<date type="published" when="2001">2001</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">Sesame: An Architecture for Storing and Querying RDF and RDF Schema</title>
		<author>
			<persName><forename type="first">J</forename><surname>Broekstra</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Kampman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>van Harmelen</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proc. ISWC 2002</title>
				<meeting>ISWC 2002</meeting>
		<imprint>
			<publisher>Springer LNCS</publisher>
			<biblScope unit="volume">2342</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Learning to Harvest Information for the Semantic Web</title>
		<author>
			<persName><forename type="first">F</forename><surname>Ciravegna</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Chapman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Dingli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Wilks</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">ESWS-04</title>
				<meeting><address><addrLine>Heraklion</addrLine></address></meeting>
		<imprint>
			<publisher>Springer LNCS</publisher>
			<date type="published" when="2004">2004</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Mining Web Sites Using Unsupervised Adaptive Information Extraction</title>
		<author>
			<persName><forename type="first">A</forename><surname>Dingli</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Ciravegna</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Guthrie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Wilks</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">EACL</title>
				<imprint>
			<date type="published" when="2003">2003</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Automatically extracting ontologically specified data from HTML tables with unknown structure</title>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">W</forename><surname>Embley</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Tao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">W</forename><surname>Liddle</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">ER2002</title>
				<meeting><address><addrLine>Tampere</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2002">2002</date>
			<biblScope unit="page" from="322" to="337" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">Information extraction with HMMs and shrinkage</title>
		<author>
			<persName><forename type="first">D</forename><surname>Freitag</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>McCallum</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the AAAI-99 Workshop on Machine Learning for IE</title>
				<meeting>the AAAI-99 Workshop on Machine Learning for IE</meeting>
		<imprint>
			<date type="published" when="1999">1999</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">Multilingual XML-Based Named Entity Recognition for E-Retail Domains</title>
		<author>
			<persName><forename type="first">C</forename><surname>Grover</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>McDonald</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Gearailt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Karkaletsis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Farmakiotou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Samaritakis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Petasis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Pazienza</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Vindigni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Vichot</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Wolinski</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">LREC Conference</title>
				<meeting><address><addrLine>Las Palmas</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2002">2002</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">Information Extraction and Ontology Learning Guided by Web Directory</title>
		<author>
			<persName><forename type="first">M</forename><surname>Kavalec</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Svátek</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">ECAI Workshop on NLP and ML for ontology engineering</title>
				<meeting><address><addrLine>Lyon</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2002">2002</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Types and Roles of Ontologies in Web Information Extraction</title>
		<author>
			<persName><forename type="first">M</forename><surname>Labský</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Svátek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Šváb</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">ECML/PKDD04 Workshop on Knowledge Discovery and Ontologies</title>
				<meeting><address><addrLine>Pisa</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2004">2004</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">Latent semantic indexing for image retrieval systems</title>
		<author>
			<persName><forename type="first">P</forename><surname>Praks</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Dvorský</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Snášel</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the SIAM Conference on Applied Linear Algebra (LA03)</title>
				<meeting>the SIAM Conference on Applied Linear Algebra (LA03)<address><addrLine>Williamsburg, USA</addrLine></address></meeting>
		<imprint>
			<publisher>The College of William and Mary</publisher>
			<date type="published" when="2003">2003</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">Iris Recognition Using the SVD-Free Latent Semantic Indexing</title>
		<author>
			<persName><forename type="first">P</forename><surname>Praks</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Machala</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Snášel</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">MDM/KDD 2004 - Fifth International Workshop on Multimedia Data Mining</title>
				<meeting><address><addrLine>Seattle, USA</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2004">2004</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">A tutorial on hidden Markov models and selected applications in speech recognition</title>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">R</forename><surname>Rabiner</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Proceedings of the IEEE</title>
		<imprint>
			<date type="published" when="1989">1989</date>
			<biblScope unit="volume">77</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">Discovering Company Descriptions on the Web by Multiway Analysis</title>
		<author>
			<persName><forename type="first">V</forename><surname>Svátek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Berka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Kavalec</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kosek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Vávra</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Intelligent Information Processing and Web Mining, IIPWM&apos;03</title>
				<imprint>
			<publisher>Springer Verlag</publisher>
			<date type="published" when="2003">2003</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">Knowledge Modelling for Deductive Web Mining</title>
		<author>
			<persName><forename type="first">V</forename><surname>Svátek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Labský</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Vacura</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proc. EKAW 2004</title>
				<meeting>EKAW 2004</meeting>
		<imprint>
			<publisher>LNCS</publisher>
			<date type="published" when="2004">2004</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">Web Service Composition for Deductive Web Mining: A Knowledge Modelling Approach</title>
		<author>
			<persName><forename type="first">V</forename><surname>Svátek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>ten Teije</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Vacura</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proc. Znalosti 2005</title>
				<meeting>Znalosti 2005<address><addrLine>VSB-TU Ostrava</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2005">2005</date>
		</imprint>
	</monogr>
	<note>to appear</note>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">RDF-Based Retrieval of Information Extracted from Web Product Catalogues</title>
		<author>
			<persName><forename type="first">O</forename><surname>Šváb</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Labský</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Svátek</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">SIGIR&apos;04 Semantic Web Workshop</title>
				<meeting><address><addrLine>Sheffield</addrLine></address></meeting>
		<imprint/>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">A Methodology for Semantically Annotating a Corpus Using a Domain Ontology and Machine Learning</title>
		<author>
			<persName><forename type="first">A</forename><surname>Valarakos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Sigletos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Karkaletsis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Paliouras</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">RANLP Conference</title>
				<meeting><address><addrLine>Borovets</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2003">2003</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
