<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">TUKE System for MediaEval 2014 QUESST</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Jozef</forename><surname>Vavrek</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">Laboratory of Speech Technologies in Telecommunications</orgName>
								<orgName type="institution">Technical University of Košice</orgName>
								<address>
									<addrLine>Park Komenského 13</addrLine>
									<postCode>041 20</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Peter</forename><surname>Viszlay</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">Laboratory of Speech Technologies in Telecommunications</orgName>
								<orgName type="institution">Technical University of Košice</orgName>
								<address>
									<addrLine>Park Komenského 13</addrLine>
									<postCode>041 20</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
							<affiliation key="aff0">
								<orgName type="laboratory">Laboratory of Speech Technologies in Telecommunications</orgName>
								<orgName type="institution">Technical University of Košice</orgName>
								<address>
									<addrLine>Park Komenského 13</addrLine>
									<postCode>041 20</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Martin</forename><surname>Lojka</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">Laboratory of Speech Technologies in Telecommunications</orgName>
								<orgName type="institution">Technical University of Košice</orgName>
								<address>
									<addrLine>Park Komenského 13</addrLine>
									<postCode>041 20</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
							<affiliation key="aff0">
								<orgName type="laboratory">Laboratory of Speech Technologies in Telecommunications</orgName>
								<orgName type="institution">Technical University of Košice</orgName>
								<address>
									<addrLine>Park Komenského 13</addrLine>
									<postCode>041 20</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Matúš</forename><surname>Pleva</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">Laboratory of Speech Technologies in Telecommunications</orgName>
								<orgName type="institution">Technical University of Košice</orgName>
								<address>
									<addrLine>Park Komenského 13</addrLine>
									<postCode>041 20</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Jozef</forename><surname>Juhár</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">Laboratory of Speech Technologies in Telecommunications</orgName>
								<orgName type="institution">Technical University of Košice</orgName>
								<address>
									<addrLine>Park Komenského 13</addrLine>
									<postCode>041 20</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Jozef</forename><surname>Vavrek</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">Laboratory of Speech Technologies in Telecommunications</orgName>
								<orgName type="institution">Technical University of Košice</orgName>
								<address>
									<addrLine>Park Komenského 13</addrLine>
									<postCode>041 20</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Matus</forename><surname>Pleva</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">Laboratory of Speech Technologies in Telecommunications</orgName>
								<orgName type="institution">Technical University of Košice</orgName>
								<address>
									<addrLine>Park Komenského 13</addrLine>
									<postCode>041 20</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Jozef</forename><surname>Juhár</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">Laboratory of Speech Technologies in Telecommunications</orgName>
								<orgName type="institution">Technical University of Košice</orgName>
								<address>
									<addrLine>Park Komenského 13</addrLine>
									<postCode>041 20</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Tuke</forename><surname>Sk</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">Laboratory of Speech Technologies in Telecommunications</orgName>
								<orgName type="institution">Technical University of Košice</orgName>
								<address>
									<addrLine>Park Komenského 13</addrLine>
									<postCode>041 20</postCode>
									<settlement>Košice</settlement>
									<country key="SK">Slovakia</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">TUKE System for MediaEval 2014 QUESST</title>
					</analytic>
					<monogr>
						<imprint>
							<date/>
						</imprint>
					</monogr>
					<idno type="MD5">E83E9B974ECF1FBBE31B4B87B22B9C96</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2023-03-24T16:10+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>Two approaches to QbE (Query-by-Example) retrieving system, proposed by the Technical University of Košice (TUKE) for the query by example search on speech task (QUESST), are presented in this paper. Our main interest was focused on building such QbE system, which is able to retrieve all given queries with and without using any external speech resources. Therefore we developed posteriorgram-based keyword matching system, which utilizes a novel weighted fast sequential variant of DTW (WFS-DTW) algorithm in order to detect occurrences of each query within the particular utterance file, using two GMM-based acoustic units modeling approaches. The first one, referred as low-resource approach, employs language-dependent phonetic decoders to convert queries and utterances into posteriorgrams. The second one, defined as zero-resource approach, implements combination of unsupervised segmentation and clustering techniques by using only provided utterance files.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">MOTIVATION</head><p>The motivation for developing our system was to assess the ability of proposed WFS-DTW algorithm to detect various spoken query terms by implementing low- and zero-resource posteriorgram-based matching approach.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">WFS-DTW SEARCHING ALGORITHM</head><p>Searching algorithm for QUESST task follows the one used in our paper <ref type="bibr">[8]</ref>. Proposed solution is a modification of segmental DTW algorithm we applied in spoken web search task last year <ref type="bibr" target="#b7">[7]</ref>. There are three main contributions to this algorithm: 1) one step forward moving strategy, when each DTW search is carried out sequentially, block by block, with size equal to the length of query; 2) linear time-aligned accumulated distance for speeding up sequential DTW without considerable loss in retrieving performance; 3) optimization of global minimum for set of alignment paths by implementing weighted cumulative distance (WCD) parameter.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">LOW-RESOURCE APPROACH</head><p>The low-resource approach includes 4 language-dependent subsystems, each represented by GMM-based acoustic model. The acoustic models were trained previously using four databases: 2× Speechdat (Slovak, 66h and Czech, 89h) <ref type="bibr" target="#b6">[6]</ref>,</p><p>Copyright is held by the author/owner(s).</p><p>MediaEval 2014 Workshop, October 16-17, 2014, Barcelona, Spain Slovak ParDat1 (40h) <ref type="bibr" target="#b3">[3]</ref> and English TIMIT (10h) <ref type="bibr" target="#b4">[4]</ref>.</p><p>The well-trained models were intended to generate time-aligned and labelled segments for each utterance through Viterbi decoding. The phonetic decoder employed a phone-level vocabulary and a phone network. We found that the phoneme insertion log probability p in Viterbi segmentation has a significant impact on time-alignment. Since the best results were obtained with p = 0, we used this value in the whole setup. The time-alignments were used to train a new GMM-based acoustic model using the development data. It means that each language-dependent model was replaced by its refined version, which was finally used to generate the posteriorgrams for utterances and queries.</p><p>Note that we used 39-dimensional MFCC (Mel-Frequency Cepstral Coefficients) features for Viterbi segmentation and GMM training. In low-resource approach we did not need any voice activity detector (VAD) because the silent parts of the audio stream were identified in the Viterbi segmentation.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">ZERO-RESOURCE APPROACH</head><p>In keeping with the zero-resource approach, we did not assume any prior knowledge of the acoustic units or pronunciation lexicon. In order to train the acoustic models, it was firstly necessary to identify the acoustic speech units in the audio data automatically. In this work, we utilized four different zero-resource approaches to address this problem.</p><p>Type 1: This one uses a PCA-based VAD to discriminate the voice active segments from the silent ones <ref type="bibr">[8]</ref>. The initial feature selection, based on simple PCA (principal component analysis) <ref type="bibr" target="#b5">[5]</ref>, is carried out after extracting first 13 MFCCs. Only those speech active feature vectors are selected, whose variance achieves values greater than 90% at the first principal component. Then, K-means clustering with K = 75 clusters and correlation distance metric is computed on the reduced data. The clustering starts by selecting K points uniformly. Finally, speech segmentation is performed by computing the squared Euclidean distance between feature vectors and K mean vectors, where the label of the mean vector with minimum distance is assigned in collaboration with VAD.</p><p>Type 2: Type 2 approach comes directly out from the Type 1 and is further extended by Viterbi segmentation and new GMM training. These two steps are identical to those already described in Section 3. The main difference is that the acoustic model from the Type 1 is used to generate the time-alignments through Viterbi segmentation.</p><p>Type 3: The third approach is based on the well-known flat start training procedure <ref type="bibr" target="#b9">[9]</ref>. It does not need any segmentation or clustering because the utterances are uniformly segmented using the Baum-Welch embedded re-estimation. Therefore, an alternative GMM initialization strategy is applied, where all phone models are initialized identically with state means and variances equal to the global mean and variance. 
The phone models are then moved straight to embedded training and simultaneously updated and expanded to the higher GMs (Gaussian Mixtures) <ref type="bibr" target="#b9">[9]</ref>. The key element in flat start training is the phone-level transcription, obtained from the phone-based recognition using the acoustic model acquired from the first type zero-resource approach. Type 4: Type 4 approach implements GMM-based segmentation and ergodic HMM (EHMM) training. Firstly, an unsupervised GMM training is performed on whole database, where each acoustic unit is represented by one GM. Each GM is then associated with one of the 64 states in EHMM and new GMs for each acoustic unit are trained iteratively.</p><p>Note that we used conventional 39-dimensional MFCCs for each zero-resource processing (except the Type 1). We did not use any VAD here (except the Type 1) because the &lt;sil&gt; labels were available from the Viterbi segmentation.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">POST-PROCESSING: SCORE NORMAL-IZATION AND FUSION</head><p>Score parameter was represented by WCD, normalized by scaling factor 0/1, similarly as we used in <ref type="bibr">[8]</ref>. This step helped us to unify score ranges for the first 500 detection candidates per each query. Then the score fusion for four different subsystems was carried out, employing a simple max-score merging strategy, similarly as Anguera et al. did in <ref type="bibr" target="#b1">[1]</ref>. Detection candidates from each individual subsystem were merged together, keeping the one with the highest score in case of overlap. Merged candidates for each query were subsequently normalized by z-normalization and aligned according to the score value. The final set was obtained by keeping first 45-150 candidates, according to the length of query (the shorter the query, the lower the number of candidates).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">RESULTS AND CONCLUSION</head><p>We submitted four runs obtained from low-resource (primary) and zero-resource (general) systems for QUESST 2014 task <ref type="bibr" target="#b2">[2]</ref>. The primary systems employ language-dependent acoustic modeling using Viterbi segmentation with 128 GMs (ParDat1, TIMIT) and 256 GMs (Speechdat SK, CZ). The general systems use 32 GMs for Type 1,2,3 and 64 GM for Type 4. The best-one-win strategy was used at first runs (on time). Thus, only the subsystem with best performance was submitted, namely p-low using Speechdat SK and g-zero Type 2 subsystem. Late submissions include max-score merging fusion of four subsystems for both primary and general approaches. Results in Tab. 1 show that there are still big differences in performance between p-low and g-zero approaches, even if the score fusion technique was applied. Moreover, there is also a considerable gap between act and min Cnxe despite the fact that the act and max TWV are perfectly calibrated. Therefore, improved calibration/fusion models based on affine transformation and linear-regression will be investigated in the future.</p><p>The indexing was done using 2xIBM x3650 (Intel E5530 @ 2.4 GHz, 8 cores), 28 GB RAM, under Debian OS. Searching algorithm was running on 52xIBM dx360 M3 cluster (Intel E5645 @ 2.4GHz, 624 cores), 48 GB RAM per node, running on Scientific Linux 6 and Torque (see Tab. 
2).</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1 :</head><label>1</label><figDesc>Evaluation of primary low-resource (p-low) and general zero-resource (g-zero) systems (* indicates late submission)</figDesc><table><row><cell></cell><cell></cell><cell>eval</cell><cell></cell><cell>dev</cell></row><row><cell>system</cell><cell>Cnxe</cell><cell>TWV</cell><cell>Cnxe</cell><cell>TWV</cell></row><row><cell></cell><cell>(act/min)</cell><cell>(act/max)</cell><cell>(act/min)</cell><cell>(act/max)</cell></row><row><cell>p-low</cell><cell cols="4">0.959/0.891 0.154/0.154 0.960/0.892 0.161/0.162</cell></row><row><cell>g-zero</cell><cell cols="4">0.973/0.934 0.075/0.077 0.974/0.934 0.091/0.091</cell></row><row><cell>p-low*</cell><cell cols="4">0.947/0.853 0.168/0.169 0.948/0.854 0.191/0.191</cell></row><row><cell>g-zero*</cell><cell cols="4">0.970/0.921 0.102/0.103 0.971/0.922 0.106/0.107</cell></row><row><cell cols="5">mentation or clustering because the utterances are uniformly</cell></row><row><cell cols="5">segmented using the Baum-Welch embedded re-estimation.</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>Table 2 :</head><label>2</label><figDesc>Processing resources measures</figDesc><table><row><cell>system</cell><cell>ISF</cell><cell>SSF</cell><cell cols="2">P M UI P M US</cell><cell>PL</cell></row><row><cell cols="3">p-low (dev) 0.61 0.0034</cell><cell>0.05</cell><cell>2.46</cell><cell>0.0106</cell></row><row><cell cols="3">g-zero (dev) 1.5 0.0042</cell><cell>1.4</cell><cell>3.92</cell><cell>0.225</cell></row></table></figure>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="7.">ACKNOWLEDGMENTS</head><p>This publication is the result of the Project implementation: University Science Park TECHNICOM for Innovation Applications Supported by Knowledge Technology, ITMS: 26220220182, supported by the Research &amp; Development Operational Programme funded by the ERDF (100%).</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<monogr>
		<title/>
		<author>
			<persName><surname>References</surname></persName>
		</author>
		<imprint/>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">The Telefonica Research Spoken Web Search System for MediaEval</title>
		<author>
			<persName><forename type="first">X</forename><surname>Anguera</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Working Notes Proc. of the MediaEval 2013</title>
				<imprint>
			<date type="published" when="2013">2013. 2013</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Query by Example Search on Speech at Mediaeval</title>
		<author>
			<persName><forename type="first">X</forename><surname>Anguera</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Metze</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Buzo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Szoke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">J</forename><surname>Rodriguez-Fuentes</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Working Notes Proc. of the MediaEval 2014 Workshop</title>
				<meeting><address><addrLine>Barcelona, Spain</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2014-10">2014. October 2014</date>
			<biblScope unit="page" from="16" to="17" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Rule-based Triphone Mapping for Acoustic Modeling in Automatic Speech Recognition</title>
		<author>
			<persName><forename type="first">S</forename><surname>Darjaa</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proc. of the 14th Intl. Conf. on Text, Speech and Dialogue, TSD&apos;11</title>
				<meeting>of the 14th Intl. Conf. on Text, Speech and Dialogue, TSD&apos;11</meeting>
		<imprint>
			<date type="published" when="2011">2011</date>
			<biblScope unit="page" from="268" to="275" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">TIMIT Acoustic-Phonetic Continuous Speech Corpus</title>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">S</forename><surname>Garofolo</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Linguistic Data Consortium</title>
				<meeting><address><addrLine>Philadelphia</addrLine></address></meeting>
		<imprint>
			<date type="published" when="1993">1993</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">Linear Feature Transformations in Slovak Phoneme-Based Continuous Speech Recognition</title>
		<author>
			<persName><forename type="first">J</forename><surname>Juhár</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Viszlay</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Modern Speech Recognition Approaches with Case Studies</title>
				<imprint>
			<publisher>InTech Open Access</publisher>
			<date type="published" when="2012">2012</date>
			<biblScope unit="page" from="131" to="154" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">SpeechDat-E: five eastern european speech databases for voice-operated teleservices completed</title>
		<author>
			<persName><forename type="first">H</forename><surname>Van Den</surname></persName>
		</author>
		<author>
			<persName><surname>Heuvel</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proc. of INTERSPEECH</title>
				<meeting>of INTERSPEECH</meeting>
		<imprint>
			<date type="published" when="2001">2001</date>
			<biblScope unit="page" from="2059" to="2062" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">TUKE at MediaEval 2013 Spoken Web Search Task</title>
		<author>
			<persName><forename type="first">J</forename><surname>Vavrek</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Working Notes Proc. of the MediaEval 2013</title>
				<imprint>
			<date type="published" when="2013">2013</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Query-by-Example Retrieval via Fast Sequential Dynamic Time Warping Algorithm</title>
		<author>
			<persName><forename type="first">J</forename><surname>Vavrek</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">TSP 2014</title>
				<meeting><address><addrLine>Berlin, DE</addrLine></address></meeting>
		<imprint>
			<publisher>IEEE</publisher>
			<date type="published" when="2014-07">July 2014</date>
			<biblScope unit="page" from="469" to="473" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<monogr>
		<title level="m" type="main">The HTK Book (for HTK Version 3</title>
		<author>
			<persName><forename type="first">S</forename><surname>Young</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2006">2006</date>
			<biblScope unit="volume">4</biblScope>
		</imprint>
		<respStmt>
			<orgName>Cambridge University</orgName>
		</respStmt>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
