<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Transfer Learning and Data Augmentation Techniques applied to Speech Emotion Recognition in SE&amp;R 2022</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Caroline</forename><surname>Alves</surname></persName>
							<email>carolalves@usp.br</email>
							<affiliation key="aff0">
								<orgName type="department">Departamento de Letras Clássicas e Vernáculas</orgName>
								<orgName type="institution">FFLCH-USP</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Bruno</forename><surname>Carlotto</surname></persName>
							<affiliation key="aff1">
								<orgName type="department">Instituto de Ciências Matemáticas e de Computação</orgName>
								<orgName type="institution">ICMC-USP</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Bruno</forename><surname>Dias</surname></persName>
							<email>brunoadiaspapa1@usp.br</email>
							<affiliation key="aff0">
								<orgName type="department">Departamento de Letras Clássicas e Vernáculas</orgName>
								<orgName type="institution">FFLCH-USP</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Anátale</forename><surname>Garcia</surname></persName>
							<email>anatale.garcia@usp.br</email>
							<affiliation key="aff0">
								<orgName type="department">Departamento de Letras Clássicas e Vernáculas</orgName>
								<orgName type="institution">FFLCH-USP</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Bruno</forename><surname>Gianesi</surname></persName>
							<email>brunogianesi@usp.br</email>
							<affiliation key="aff2">
								<orgName type="department">Engenharia Mecatrônica</orgName>
								<orgName type="institution">EESC-USP</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Renan</forename><surname>Izaias</surname></persName>
							<email>renan.izaias@usp.br</email>
							<affiliation key="aff0">
								<orgName type="department">Departamento de Letras Clássicas e Vernáculas</orgName>
								<orgName type="institution">FFLCH-USP</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Maria</forename><surname>Luiza De Morais</surname></persName>
							<email>marialuizamorais@usp.br</email>
							<affiliation key="aff0">
								<orgName type="department">Departamento de Letras Clássicas e Vernáculas</orgName>
								<orgName type="institution">FFLCH-USP</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Paula</forename><surname>De Oliveira</surname></persName>
							<email>paulamarindeoliveira@usp.br</email>
							<affiliation key="aff0">
								<orgName type="department">Departamento de Letras Clássicas e Vernáculas</orgName>
								<orgName type="institution">FFLCH-USP</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Vinícius</forename><forename type="middle">G</forename><surname>Santos</surname></persName>
							<email>vinicius.santos@alumni.usp.br</email>
							<affiliation key="aff0">
								<orgName type="department">Departamento de Letras Clássicas e Vernáculas</orgName>
								<orgName type="institution">FFLCH-USP</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Rafael</forename><surname>Sicoli</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Departamento de Letras Clássicas e Vernáculas</orgName>
								<orgName type="institution">FFLCH-USP</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Flaviane</forename><forename type="middle">R</forename><surname>Fernandes Svartman</surname></persName>
							<email>flavianesvartman@usp.br</email>
							<affiliation key="aff0">
								<orgName type="department">Departamento de Letras Clássicas e Vernáculas</orgName>
								<orgName type="institution">FFLCH-USP</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Sandra</forename><surname>Aluisio</surname></persName>
							<email>sandra@icmc.usp.br</email>
							<affiliation key="aff1">
								<orgName type="department">Instituto de Ciências Matemáticas e de Computação</orgName>
								<orgName type="institution">ICMC-USP</orgName>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Sidney</forename><surname>Leal</surname></persName>
							<email>sidleal@gmail.com</email>
							<affiliation key="aff1">
								<orgName type="department">Instituto de Ciências Matemáticas e de Computação</orgName>
								<orgName type="institution">ICMC-USP</orgName>
							</affiliation>
						</author>
						<title level="a" type="main">Transfer Learning and Data Augmentation Techniques applied to Speech Emotion Recognition in SE&amp;R 2022</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">85169C784A387D7749978856F0DE7B73</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2023-03-23T22:20+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Deep Learning</term>
					<term>Transfer Learning</term>
					<term>Data Augmentation</term>
					<term>Speech Emotion Recognition</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>In this work, our team ICMC-EESC-FFLCH explores several techniques to address data scarcity and imbalance in SE&amp;R 2022 task dedicated to speech emotion recognition (SER). We evaluate two types of transfer learning models: (i) Multi-task learning, in which two tasks are learned simultaneously, and (ii) Sequential transfer learning where the tasks are learned sequentially. In both models, the auxiliary task is genre classification from speech, using a large dataset with almost 145 hours of speech signals. As for the techniques to balance the training data, we have used the SMOTE (Synthetic Minority Over-sampling Technique) and Praat's Change gender command to over-sampling minority classes. Our Sequential transfer learning architecture, using the two baselines feature sets provided by the shared-task (prosodic audio features and embeddings generated by the Wav2Vec 2.0 model) and the two approaches to balance the training dataset reaches satisfactory performance with a 0.5353 F1-macro, surpassing the prosodic features baseline. On the other hand, our multi-task learning approach using the two baseline features sets and the SMOTE approach to balance the training dataset reaches only a 0.5301 F1-macro. Finally, our worst result is 0.469 F1-macro, obtained with the feature selection experiment (29 prosodic features manually chosen from the literature), using our multi-task learning architecture with the two approaches to balance the training dataset.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>According to <ref type="bibr" target="#b0">[1]</ref>, speech emotion recognition (SER) systems are composed of methods, namely feature extraction and emotion classification, that process and classify speech signals to detect the embedded emotions of speech. They can also include a preprocessing step before the extraction of the features used to normalize the signals, for example, the use of noise reduction techniques. Emotion classes depend on labeled data of the dataset used to create the model; these datasets can be of three types: acted, elicited or natural. While most of the natural datasets are from spontaneous speech recorded in noisy environments, acted speech databases are recorded by professional actors in sound-proof studios. Elicited speech datasets are created by placing speakers in a simulated emotional situation that can stimulate various emotions and can be close to real ones. It is important to notice that, the definition of emotion is an open problem in psychology and there are two models being used in SER systems: discrete and dimensional emotional models. The first one is based on the six primary and culturally independent categories of basic emotions <ref type="bibr" target="#b1">[2]</ref>: sadness, happiness, fear, anger, disgust, and surprise, where other emotions are obtained by the combination of the basic ones. Most of the existing SER systems focus on all these basic emotional categories, sometimes including the neutral category (see, for example, <ref type="bibr" target="#b2">[3]</ref>, a study focusing on Portuguese language), or in a small group of those emotions <ref type="foot" target="#foot_0">1</ref> . The second one, the dimensional emotional model, uses a small number of latent dimensions to define emotions such as: valence, arousal/excitation, control/power. 
In this model, emotions are not independent of each other, instead, they are analogous to each other in a systematic way. <ref type="bibr" target="#b4">[5]</ref> support the thesis that the three dimensions of pleasure-displeasure (valence), arousal-nonarousal (excitation), and dominance-submissiveness (power/control) are both necessary and sufficient to describe a large variety of emotional states. Specifically, valence describes whether an emotion is positive or negative, and it ranges between unpleasant and pleasant; excitation defines the strength of the felt emotion, ranging from boredom to frantic excitement; and the dimension of control/power refers to the seeming strength of the person (between weak and strong). For example, the third dimension differentiates anger from fear by considering the strength or weakness of the person, respectively; however, as the surprise emotion may have positive or negative valence depending on the context, it is difficult to categorize.</p><p>Whereas most studies on SER deal with simulated, noise-free datasets recorded in sound-proof studios <ref type="bibr" target="#b3">[4]</ref>, SE&amp;R 2022 brings a small dataset of approximately 50 minutes, with 625 audio segments (training dataset) from the C-ORAL-BRASIL I corpus <ref type="bibr" target="#b5">[6]</ref>, consisting of audio segments representing Brazilian Portuguese informal spontaneous speech, recorded in natural contexts and noisy environments.</p><p>The two baseline feature sets (prosodic audio features for emotion classification <ref type="bibr" target="#b6">[7,</ref><ref type="bibr" target="#b7">8]</ref> and embeddings generated by the Wav2Vec 2.0 model <ref type="bibr" target="#b8">[9]</ref>) made available for SE&amp;R 2022 were used in this work. Feature selection was also evaluated, focusing on four small prosodic feature sets, manually chosen, with 29, 19, 10, and 8 features, taken from pitch, intensity, and spectrum groups of features. 
While the first SER systems used machine learning methods with a careful feature engineering (see several examples in <ref type="bibr" target="#b9">[10]</ref>), recent approaches use ensembles to learn hybrid acoustic features <ref type="bibr" target="#b10">[11]</ref>, and deep learning architectures, such as multi-task learning <ref type="bibr" target="#b11">[12,</ref><ref type="bibr" target="#b12">13]</ref>, attention mechanisms <ref type="bibr" target="#b13">[14]</ref>, and transfer learning approaches <ref type="bibr" target="#b14">[15]</ref>.</p><p>Our contribution to SE&amp;R 2022 explores two architectures based on deep neural networks (DNN) aiming at detecting Speech Emotion Recognition in Portuguese audio files. Our proposal evaluates two types of inductive transfer learning: multi-task <ref type="bibr" target="#b15">[16]</ref> and sequential transfer learning <ref type="bibr" target="#b16">[17]</ref>. In both models, the auxiliary task is gender classification from speech 2 . Since DNN-based classifiers have a generalization error problem when trained with limited datasets, we explore two different data augmentation techniques aimed to balance the training data. We have used the SMOTE <ref type="bibr" target="#b17">[18]</ref> to create synthetic data for the minority classes and Praat's <ref type="bibr" target="#b18">[19]</ref> Change gender command to manipulate the acoustic features in order to create new synthetic data based on the pre-existing ones. The Jupyter notebooks and characterization of the training dataset are publicly available at https://github.com/BrunoBaldissera/ser-transfer.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Experimental Framework</head><p>First, we present the original dataset for the main task and the dataset used for the auxiliary task of gender classification from speech in both inductive transfer learning architectures (Section 2.1), noting that the original dataset is unbalanced. Therefore, we applied two techniques for data augmentation (Section 2.2). Section 2.3 presents the feature sets we explored in our linguistically motivated selection of prosodic features, based on the literature. Finally, Section 2.4 presents our multi-task and sequential transfer learning architectures.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Datasets</head><p>2.1.1 Primary Task Dataset: official dataset of SE&amp;R shared-task on SER. In the SE&amp;R 2022 shared-task on SER, the audio segments are labeled in three classes: neutral, non-neutral female, and non-neutral male. The neutral class is the majority class (491 samples) and is used to label audio segments with no well-defined emotional state while the non-neutral classes label segments (89 non-neutral-female and 45 non-neutral-male) associated with one of the primary emotional states in the speaker's speech. In order to better understand the training dataset used in this study, seven annotators from our group pursued a qualitative analysis of the dataset. They labeled every audio in the training set with "yes" (meaning presence) or "no" (meaning absence) according to the following categories:</p><p>• Noise: any sort of noise not related with the primary voice(s) <ref type="foot" target="#foot_2">3</ref> , e.g., background chatting, microphone hissing noise, music, children voices, etc.; • Voice overlapping: periods in which there were two primary voices speaking at the exact same moment; • Different gender: the presence of more than one perceived gender in the primary voices of the same audio; and • Voices in sequence: the presence of more than one primary voice in the same audio, but without direct overlapping between them.</p><p>Our evaluation is summarized in Figures <ref type="figure" target="#fig_1">1a and 1b</ref>. As we can see, there is a lot of noisy audio. Although noise is not a problem for the auxiliary task (Audio Gender Classification) <ref type="bibr" target="#b19">[20]</ref> of the neural architectures, only an error analysis can identify possible problems for the SER task as a whole. 
Also, two complex problems were found: high overlapping rate of voices and audios with different genders, which we believe may have an impact on the classification of the 2 non-neutral classes (male and female). Of the 26 non-neutral audios that have different gender,   <ref type="figure" target="#fig_1">1a and 1b</ref> show a characterization of the training dataset, presenting the number of audios with noise, primary overlapping voices, primary voices with different genders, primary voices in sequence, for both types of classes (neutral and non-neutral) audios.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.2">Auxiliary</head><p>Task Dataset: CETUC. The task of classifying gender based on voice identifies automatically a voice as male or female, based on the audio features. The gender identification of a given speaker was implemented in an undergrad project of one of the authors <ref type="bibr" target="#b19">[20]</ref>, to evaluate machine learning methods, such as decision trees, random forest, gradient boosting, support vector machine, multi-layer perceptron and logistic regression, and to compare the use of distinct features and models applied on different datasets. In addition, the study also assessed whether the models generalize to other contexts, such as other languages (English) or noisy environments, when trained on CETUC dataset <ref type="bibr" target="#b20">[21]</ref> that was recorded in a controlled environment.</p><p>The best performance method (gradient boosting) was trained using the large dataset CETUC, with almost 145 hours of speech signals spoken by 50 male and 50 female speakers <ref type="foot" target="#foot_3">4</ref> , each one pronouncing 1,000 phonetically balanced sentences selected from the CETEN-Folha corpus <ref type="foot" target="#foot_4">5</ref> . The best performance model used three sets of features from audio signals, totalling 44 features: (i) 12 statistics extracted from the highest frequency value, after applying the Fourier transform on the audios, divided into time windows of 0.2 seconds, (ii) the fundamental frequency (F0) statistics ( <ref type="formula">12</ref>) and (iii) 20 MFCCs (Mel-Frequency Cepstral Coefficients), and reached an accuracy of 94,1%. This model was able to generalize well to audios with noise; it reached an accuracy of 90,8% on the testset MLS <ref type="bibr" target="#b21">[22]</ref> with noise.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Data Augmentation Approaches: SMOTE and Praat's Change Gender</head><p>We used two approaches to balance the training dataset applied specifically on audios of non-neutral male and non-neutral female classes: SMOTE <ref type="bibr" target="#b17">[18]</ref> and Praat's Change gender command <ref type="bibr" target="#b18">[19]</ref>.</p><p>It is suggested by the authors of the original SMOTE paper that previously performing a random under-sampling of the majority class followed by over-sampling the minority class tends to yield good results. However, in this work, we have only over-sampled the minority classes, following the work by <ref type="bibr" target="#b22">[23]</ref>, and using the technique in its simplest implementation. Nonetheless, as the synthesis of new data with SMOTE uses a linear combination of randomly chosen neighbors of the underrepresented instances in the feature space rather than just replicating the given instances, we gave more focus to this augmentation approach in place of the simple oversampling (even though a number of such tests was performed). We have used the Python imbalanced-learn package <ref type="bibr" target="#b23">[24]</ref>; all the parameters were set as default.</p><p>Praat's Change gender command allow us to manipulate the acoustic features to create new synthetic data based on the preexisting ones. Through this method, we can change the perceived gender of a given voice into the opposite gender. The second method for data augmentation consists in the use of the algorithm for gender conversion available in the software for acoustic analysis Praat. A total of 133 files were used, 45 of them containing male voices, then converted to female ones, and 88 containing female voices, then converted to male ones<ref type="foot" target="#foot_5">6</ref> . 
The task was undertaken by five annotators and had two phases: attribution of parameters for conversion and quality evaluation of the generated voice. In the quality assessment phase, the annotators changed the previously established default values in order to obtain voices that they judged the most natural as possible. For the conversion process, we first defined the frequency range in which the algorithm parameters were applied, using the values already predefined by the program, with the minimum pitch value being 75 Hz, and the maximum 600 Hz. The algorithm contains four parameters, described below, that can be used for gender conversion, from which we have only used the first two:</p><p>• Formant shift ratio (default value is 1.0) determines the ratio for proportionally modifying the value of formants, i.e., the sound frequency values at which the highest peaks of intensity occur, resulting from the resonance of the sound wave in its path through the vocal tract, from its production in the vocal folds until the moment of emission. The factor valued 1.0 means there is no alteration. For the task, we established the factor value 1.1 as the standard for male-to-female conversion, used in 30 of 45 files, and 0.8 for female-to-male conversion, used in 72 of 88 files. As mentioned above, these values were altered in some files in order to maintain a perceived natural quality of the converted voice: for the other 15 male-to-female converted files, factors between 1.15 or 1.2 were used, and for the other 16 female-to-male converted files, values between 0.85 or 0.9. • New pitch median (default value is 0.0): a new median for the pitch values is established for each file, which, in turn, is used to compose a factor expressed by the ratio between this new median and the original median pitch. This factor is then used by the algorithm to multiply the original pitch values to obtain new values. 
In this metric, the value 0.0 represents the default setting, yielding the factor 1.0, which means no alteration. We established as standard values for this assignment the frequency measurement of 300 Hz for male-to-female conversion, for 35 of 45 files, and 140 Hz for female-to-male conversion, for 58 of 88 files. These values were also altered in some files to achieve a convincing result: for male-to-female conversion, values between 250 Hz and 380 Hz were used for the other 10 files, and for female-to-male conversion, values between 80 Hz and 260 Hz were used for the other 30 files. • Pitch range factor (default value: 1.0) provides for an additional modification in pitch by an extra scaling of the values around the new pitch median, obtained in the previous step.</p><p>A factor of 1.0 means that no additional pitch modification will occur, and a factor valued as 0.0 monotonizes the new sound to the new pitch median. Considering the essential goal of the project, the default value was kept and no modifications for the pitch range were provided. • Duration factor (default value: 1.0) establishes a factor used for lengthening the sound file. For a factor valued less than 1.0, the resulting sound will be shorter than the original, and a value higher than 3.0 will not work. The default value provided by the software was also maintained, as a change in the duration of the sound is deemed as unnecessary for the development of the task.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.">Selection of Prosodic Features for SER</head><p>We grouped the 56 prosodic audio features (one of the baseline feature sets) into six classes<ref type="foot" target="#foot_6">7</ref> in order to select those strongly related to the classes defined for SE&amp;R 2022 and evaluate them separately and conjoined: (1) related to voice quality (13 features), including local_jitter and local_shimmer, those from Harmonics-to-Noise Ratio (HNR) and those from Glottal-to-Noise Ratio (GNE); (2) related to intensity (9 features), for example, min_intensity, max_intensity; (3) related to F0 (pitch) (10 features), for example, mean_pitch, stddev_pitch; (4) related to spectrum (10 features), for example, skewness_spectrum, kurtosis_spectrum; (5) related to formants (10 features), for example, formant_dispersion, average_formant; (6) related to vocal tract length (VTL) (4 features), for example, fitch_vtl, vtl_delta_f.</p><p>The groups related to intensity (first 9 features), F0 (from 10 to 19), and spectrum (last 10 features), respectively shown in Table <ref type="table" target="#tab_0">1</ref>, were chosen for our feature selection experiment which included the training of 7 multi-task and 5 sequential classifiers, totalling 12 experiments, shown in Section 3.3. The classifiers used 10 (related to spectrum), 19 (intensity and F0) and 29 (spectrum, intensity, and F0) features and also a subset of 8 features, shown in bold in Table <ref type="table" target="#tab_0">1</ref>.</p><p>According to <ref type="bibr" target="#b24">[25]</ref>, energy, pitch, and time are the three perceptual dimensions on which most vocal indicators of various emotions are based. 
Therefore, the class of acoustic parameters related to F0, intensity, and spectrum were selected because they are reported in the literature as potential correlates of the vocal expression of emotions <ref type="bibr" target="#b24">[25,</ref><ref type="bibr" target="#b25">26,</ref><ref type="bibr" target="#b26">27,</ref><ref type="bibr" target="#b27">28]</ref>. F0 (fundamental frequency) is an acoustic correlate of the rate of vocal cords vibration, that is, the number of times a sound wave produced by the vocal cords is repeated during a given period of time. F0 is perceived as the pitch of the voice, and the range of values for this frequency varies according to sex and age <ref type="foot" target="#foot_7">8</ref> . In turn, sound intensity corresponds to the variations in the air pressure of a sound wave and is perceived as the loudness of a sound. Loudness and pitch are, in fact, elementary domains of the auditory signal and changes in sound intensity and F0 seem to be relevant to emotion analysis: higher and wider pitch ranges and higher sound intensity are typically associated with high arousal emotions (e.g., fear, anger, joy) compared to neutral speech, while lower and narrower pitch ranges and lower sound intensity are more associated with low arousal emotions (e.g., sadness, boredom, calmness) <ref type="bibr" target="#b24">[25,</ref><ref type="bibr" target="#b29">30,</ref><ref type="bibr" target="#b30">31,</ref><ref type="bibr" target="#b31">32,</ref><ref type="bibr" target="#b32">33]</ref>. Studies have also shown that emotion affects the distribution of spectral energy across the range of sound frequencies: for example, stronger energy in higher frequency bands is usually associated with high arousal emotions, while weaker energy in the same band is more associated with low arousal emotions [31] 9 .</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.4.">Neural Architectures: multi-task and sequential transfer learning</head><p>Transfer Learning is a machine learning approach that transfers weights trained in one task, domain, or language to a different one, with the aim of improving the learning generalization <ref type="bibr" target="#b16">[17]</ref>. In this work, two Transfer Learning techniques were used: Multi-task and Sequential Transfer Learning. In the first one, the training of the two tasks is performed simultaneously, sharing a layer of weights between the two tasks <ref type="bibr" target="#b15">[16]</ref>. In the second, the weights trained in the first task are transferred to the second, sequentially <ref type="bibr" target="#b33">[34]</ref>. Figure <ref type="figure" target="#fig_3">2</ref> presents the two architectures.</p><p>For the Multi-task architecture, two MultiLayer Perceptron (MLP) neural networks were used, with 4 layers each, sharing a common layer with 100 neurons. The first one focused on the binary gender prediction task, using the CETUC dataset, with 44 neurons in the input layer and one neuron in the output layer. The second (main task), focused on the prediction of the three 9 Many of these studies used speech audios recorded in sound-proof booths with controlled scenarios. Spontaneous speech recorded in natural contexts and noisy environments like SER shared-task dataset interferes with extracted features results, as the acoustic signal is affected by sound sources competing with the target signal, the performance of pitch detection algorithms degrades as the noise level increases, and even the speech signal energy depends on the distance and position between the speaker's mouth and microphone. Therefore, in future work, at least methods for noise incorporation/reduction will be explored to assess the impact of noise on data.  
SER classes, with the number of neurons in the input layer varying from 8 to 824 (according to the features used) and three neurons in the output layer. Both use a previous layer of 10 neurons before the common layer. For the Sequential architecture, two MLP's were also used, but they were trained sequentially. The first for the binary gender prediction task with 44 neurons in the input, a hidden layer of 30 neurons and one neuron in the output. The hidden layer was then frozen and transferred to the second MLP, whose input layer ranged from 73 to 868 (according to the features used) and with three neurons in the output layer (one for each class) of the second task. The frozen layer acted by predicting the gender of the samples (auxiliary task) and passing this prediction as a new internal feature to a layer of 5 neurons before the output (for models with more features this layer was changed to 10 neurons).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Experiments</head><p>All the 26 models described in Sections 3.1, 3.2 and 3.3 were trained using a batch size of 100 and 300 epochs. <ref type="table">2</ref> presents the results, in ascending order of F1-macro values, for the experiments with the sequential learning architecture. <ref type="table">3</ref> presents the results, in ascending order of F1-macro values, for the experiments with the multi-task learning architecture.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1">Sequential Learning Results. Table</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2">Multi-task Learning Results. Table</head></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head></head><label></label><figDesc>24 have voices overlapping and only 2 have voices in sequence. Of the 56 neutral audios that have different gender, 53 have voices overlapping and only 3 have voices in sequence.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Figure 1 :</head><label>1</label><figDesc>Figure1: A qualitative analysis of the SER dataset performed by our team. Figures1a and 1bshow a characterization of the training dataset, presenting the number of audios with noise, primary overlapping voices, primary voices with different genders, primary voices in sequence, for both types of classes (neutral and non-neutral) audios.</figDesc><graphic coords="4,96.72,133.21,198.42,122.69" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 2 :</head><label>2</label><figDesc>Figure 2: Transfer Learning architectures: a) Multi-task: 2 MLP's with 4 layers (1 shared); and b) Sequential: the second MLP with 5 layers uses a frozen layer from the first. Prosodic Features Set 1 is composed of 44 features described in the work developed by<ref type="bibr" target="#b19">[20]</ref> while Prosodic Features Set 2 is composed of 56 features provided by the SE&amp;R shared-task on SER and described in Section 2.3.</figDesc><graphic coords="8,282.09,94.15,221.09,179.97" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1</head><label>1</label><figDesc>Features used in the classifiers of the feature selection experiment.</figDesc><table><row><cell>1</cell><cell>Min_intensity</cell><cell>16 Q1_pitch</cell></row><row><cell>2</cell><cell cols="2">Relative_min_intensity_time 17 Q3_pitch</cell></row><row><cell>3</cell><cell>Max_intensity</cell><cell>18 Mean_absolute_pitch_slope</cell></row><row><cell>4</cell><cell cols="2">Relative_max_intensity_time 19 Pitch_slope_without_octave_jumps</cell></row><row><cell>5</cell><cell>Mean_intensity</cell><cell>20 Center_of_gravity_spectrum</cell></row><row><cell>6</cell><cell>Stddev_intensity</cell><cell>21 Stddev_spectrum</cell></row><row><cell>7</cell><cell>Q1_intensity</cell><cell>22 Skewness_spectrum</cell></row><row><cell>8</cell><cell>Median_intensity</cell><cell>23 Kurtosis_spectrum</cell></row><row><cell>9</cell><cell>Q3_intensity</cell><cell>24 Central_moment_spectrum</cell></row><row><cell cols="2">10 Min_pitch</cell><cell>25 Voiced_fraction</cell></row><row><cell cols="2">11 Relative_min_pitch_time</cell><cell>26 Band_energy</cell></row><row><cell cols="2">12 Max_pitch</cell><cell>27 Band_density</cell></row><row><cell cols="2">13 Relative_max_pitch_time</cell><cell>28 Band_energy_difference</cell></row><row><cell cols="2">14 Mean_pitch</cell><cell>29 Band_density_difference</cell></row><row><cell cols="2">15 Stddev_pitch</cell><cell></cell></row></table></figure>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="1" xml:id="foot_0">There are large lists of datasets used for emotion recognition in<ref type="bibr" target="#b0">[1]</ref> and<ref type="bibr" target="#b3">[4]</ref>.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="2" xml:id="foot_1">Project's github: https://github.com/BrunoGianesi/Speaker-Gender-Recognition.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="3" xml:id="foot_2">We consider primary voices to be the loudest, and secondary voices to be the least prominent in the audio.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="4" xml:id="foot_3">https://igormq.github.io/datasets/</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="5" xml:id="foot_4">https://www.linguateca.pt/cetenfolha/index_info.html</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="6" xml:id="foot_5">For one of the audios, the algorithm could not produce a successful conversion.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="7" xml:id="foot_6">The feature voiced_fraction was allocated in the group of spectrum features, instead of with the pitch group.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="8" xml:id="foot_7">For instance, 80-200 Hz for adult males, 180-400 Hz for adult females<ref type="bibr" target="#b28">[29]</ref>, and higher ranges for children. The mean values change for older ages.</note>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>This research was carried out at the Center for Artificial Intelligence (C4AI-USP), with support by the São Paulo Research Foundation (FAPESP grant 2019/07665-4) and by the IBM Corporation.</p></div>
			</div>

			<div type="annex">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 2</head><p>Sequential Learning results using 5-fold cross-validation. We indicate in the model's name which feature set was used and whether a data augmentation technique was used (+) or was not used (-). The last line indicates the value of F1-macro for the submitted model, using the full dataset.  </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.4">Preliminary Evaluation of the Selected Models.</head><p>Table <ref type="table">5</ref> shows the confusion matrices for the first fold (20% of data), related to the three selected models. In the matrices, rows are termed as the actual/true class and columns are termed as the predicted class. For the three selected models, the neutral class had the worst performance. It seems that the auxiliary task (gender classification from speech) has helped in classifying non-neutral male and non-neutral female classes.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Conclusions and Future Work</head><p>In this work, we evaluated 26 DNN models, using 5-fold cross-validation over the training dataset, and submitted our best models, i.e. those with higher F1-macro, for each group of experiments in Sections 3.1, 3.2, and 3.3. One of the submitted models surpassed the prosodic features baseline, reaching 0.5353 F1-macro. As a future work, we will perform an error analysis to  </p></div>			</div>
			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Speech emotion recognition: Emotional models, databases, features, preprocessing methods, supporting modalities, and classifiers</title>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">B</forename><surname>Akçay</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Oğuz</surname></persName>
		</author>
		<idno type="DOI">10.1016/j.specom.2019.12.001</idno>
		<idno>doi:</idno>
		<ptr target="https://doi.org/10.1016/j.specom.2019.12.001" />
	</analytic>
	<monogr>
		<title level="j">Speech Communication</title>
		<imprint>
			<biblScope unit="volume">116</biblScope>
			<biblScope unit="page" from="56" to="76" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">Facial expressions of emotion</title>
		<author>
			<persName><forename type="first">P</forename><surname>Ekman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Oster</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Annual Review of Psychology</title>
		<imprint>
			<biblScope unit="volume">30</biblScope>
			<biblScope unit="page" from="527" to="554" />
			<date type="published" when="1979">1979</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<monogr>
		<title level="m" type="main">DEEP: Uma arquitetura para reconhecer emoção com base no espectro sonoro da voz de falantes da língua portuguesa</title>
		<author>
			<persName><forename type="first">G</forename><forename type="middle">A</forename><surname>Campos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Da</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Moutinho</surname></persName>
		</author>
		<ptr target="https://bdm.unb.br/handle/10483/27583" />
		<imprint>
			<date type="published" when="2020-01-18">2020. january 18, 2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Deep cross-corpus speech emotion recognition: Recent advances and perspectives</title>
		<author>
			<persName><forename type="first">S</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Tao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Zhao</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Frontiers in Neurorobotics</title>
		<imprint>
			<biblScope unit="volume">15</biblScope>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Evidence for a three-factor theory of emotions</title>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">A</forename><surname>Russell</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mehrabian</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of research in Personality</title>
		<imprint>
			<biblScope unit="volume">11</biblScope>
			<biblScope unit="page" from="273" to="294" />
			<date type="published" when="1977">1977</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">The C-ORAL-BRASIL I: Reference corpus for spoken Brazilian Portuguese</title>
		<author>
			<persName><forename type="first">T</forename><surname>Raso</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Mello</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">M</forename><surname>Mittmann</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC&apos;12), European Language Resources Association (ELRA)</title>
				<meeting>the Eighth International Conference on Language Resources and Evaluation (LREC&apos;12), European Language Resources Association (ELRA)<address><addrLine>Istanbul, Turkey</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2012">2012</date>
			<biblScope unit="page" from="106" to="113" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">Automatic emotion recognition using prosodic parameters</title>
		<author>
			<persName><forename type="first">I</forename><surname>Luengo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Navas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Hernáez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Sánchez</surname></persName>
		</author>
		<ptr target="http://www.isca-speech.org/archive/interspeech_2005/i05_0493.html" />
	</analytic>
	<monogr>
		<title level="m">INTERSPEECH 2005 -Eurospeech, 9th European Conference on Speech Communication and Technology</title>
				<meeting><address><addrLine>Lisbon, Portugal</addrLine></address></meeting>
		<imprint>
			<publisher>ISCA</publisher>
			<date type="published" when="2005">September 4-8, 2005. 2005</date>
			<biblScope unit="page" from="493" to="496" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">Emotion recognition from speech using global and local prosodic features</title>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">S</forename><surname>Rao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">G</forename><surname>Koolagudi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">R</forename><surname>Vempada</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Int. J. Speech Technol</title>
		<imprint>
			<biblScope unit="volume">16</biblScope>
			<biblScope unit="page" from="143" to="160" />
			<date type="published" when="2013">2013</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<monogr>
		<title level="m" type="main">wav2vec 2.0: A framework for self-supervised learning of speech representations</title>
		<author>
			<persName><forename type="first">A</forename><surname>Baevski</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Zhou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mohamed</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Auli</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2006.11477</idno>
		<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">Deep learning techniques for speech emotion recognition, from databases to models</title>
		<author>
			<persName><forename type="first">B</forename><forename type="middle">J</forename><surname>Abbaschian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Sierra-Sosa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Elmaghraby</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Sensors</title>
		<imprint>
			<biblScope unit="volume">21</biblScope>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">Ensemble learning of hybrid acoustic features for speech emotion recognition</title>
		<author>
			<persName><forename type="first">K</forename><surname>Zvarevashe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Olugbara</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Algorithms</title>
		<imprint>
			<biblScope unit="volume">13</biblScope>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Speech Emotion Recognition with Multi-Task Learning</title>
		<author>
			<persName><forename type="first">X</forename><surname>Cai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Yuan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Zheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Huang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Church</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proc. Interspeech 2021</title>
				<meeting>Interspeech 2021</meeting>
		<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="4508" to="4512" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">Speech emotion recognition based on multi-task learning using a convolutional neural network</title>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">K</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><forename type="middle">K</forename><surname>Ha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><forename type="middle">W</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">H</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><forename type="middle">K</forename><surname>Kim</surname></persName>
		</author>
		<idno type="DOI">10.1109/APSIPA.2017.8282123</idno>
	</analytic>
	<monogr>
		<title level="m">Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)</title>
				<imprint>
			<date type="published" when="2017">2017. 2017</date>
			<biblScope unit="page" from="704" to="707" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">Improved End-to-End Speech Emotion Recognition Using Self Attention Mechanism and Multitask Learning</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Zhao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Kawahara</surname></persName>
		</author>
		<idno type="DOI">10.21437/Interspeech.2019-2594</idno>
	</analytic>
	<monogr>
		<title level="m">Proc. Interspeech 2019</title>
				<meeting>Interspeech 2019</meeting>
		<imprint>
			<date type="published" when="2019">2019</date>
			<biblScope unit="page" from="2803" to="2807" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">Real-time speech emotion recognition using a pretrained image classification network: Effects of bandwidth reduction and companding</title>
		<author>
			<persName><forename type="first">M</forename><surname>Lech</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Stolar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Best</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Bolia</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Frontiers in Computer Science</title>
		<imprint>
			<biblScope unit="volume">2</biblScope>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<monogr>
		<title level="m" type="main">Multitask learning, Machine Learning -Special issue on inductive transfer</title>
		<author>
			<persName><forename type="first">R</forename><surname>Caruana</surname></persName>
		</author>
		<imprint>
			<date type="published" when="1997">1997</date>
			<biblScope unit="volume">28</biblScope>
			<biblScope unit="page" from="41" to="75" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">Transfer learning in natural language processing</title>
		<author>
			<persName><forename type="first">S</forename><surname>Ruder</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">E</forename><surname>Peters</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Swayamdipta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Wolf</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/N19-5004</idno>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Tutorials, Association for Computational Linguistics</title>
				<meeting>the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Tutorials, Association for Computational Linguistics<address><addrLine>Minneapolis, Minnesota</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2019">2019</date>
			<biblScope unit="page" from="15" to="18" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<analytic>
		<title level="a" type="main">Smote: Synthetic minority over-sampling technique</title>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">V</forename><surname>Chawla</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">W</forename><surname>Bowyer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">O</forename><surname>Hall</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><forename type="middle">P</forename><surname>Kegelmeyer</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">J. Artif. Int. Res</title>
		<imprint>
			<biblScope unit="volume">16</biblScope>
			<biblScope unit="page" from="321" to="357" />
			<date type="published" when="2002">2002</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<monogr>
		<author>
			<persName><forename type="first">P</forename><surname>Boersma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Weenink</surname></persName>
		</author>
		<ptr target="http://www.praat.org/" />
		<title level="m">Praat: Doing phonetics by computer</title>
				<imprint>
			<date type="published" when="2010">2010</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<monogr>
		<title level="m" type="main">Classificação de gênero via análise de áudio utilizando métodos de aprendizado de máquina tradicionais</title>
		<author>
			<persName><forename type="first">B</forename><surname>Gianesi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Aluisio</surname></persName>
		</author>
		<ptr target="https://eesc.usp.br/biblioteca/" />
		<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b20">
	<analytic>
		<title level="a" type="main">LSF and LPC -Derived Features for Large Vocabulary Distributed Continuous Speech Recognition in Brazilian Portuguese</title>
		<author>
			<persName><forename type="first">V</forename><forename type="middle">F S</forename><surname>Alencar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Alcaim</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Asilomar Conference on Signals, Systems and Computers</title>
				<imprint>
			<date type="published" when="2008">2008. 2008</date>
			<biblScope unit="page" from="1237" to="1241" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b21">
	<analytic>
		<title level="a" type="main">MLS: A Large-Scale Multilingual Dataset for Speech Research</title>
		<author>
			<persName><forename type="first">V</forename><surname>Pratap</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Sriram</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Synnaeve</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Collobert</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proc. Interspeech 2020</title>
				<meeting>Interspeech 2020</meeting>
		<imprint>
			<date type="published" when="2020">2020</date>
			<biblScope unit="page" from="2757" to="2761" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<analytic>
		<title level="a" type="main">Audio-based activities of daily living (adl) recognition with large-scale acoustic embeddings from online videos</title>
		<author>
			<persName><forename type="first">D</forename><surname>Liang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Thomaz</surname></persName>
		</author>
		<idno type="DOI">10.1145/3314404</idno>
		<ptr target="https://doi.org/10.1145/3314404.doi:10.1145/3314404" />
	</analytic>
	<monogr>
		<title level="j">Proc. ACM Interact. Mob. Wearable Ubiquitous Technol</title>
		<imprint>
			<biblScope unit="volume">3</biblScope>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b23">
	<analytic>
		<title level="a" type="main">Imbalanced-learn: A python toolbox to tackle the curse of imbalanced datasets in machine learning</title>
		<author>
			<persName><forename type="first">G</forename><surname>Lemaître</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Nogueira</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">K</forename><surname>Aridas</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">J. Mach. Learn. Res</title>
		<imprint>
			<biblScope unit="volume">18</biblScope>
			<biblScope unit="page" from="559" to="563" />
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b24">
	<analytic>
		<title level="a" type="main">Vocal expression and communication of emotion</title>
		<author>
			<persName><forename type="first">J</forename><surname>Pittam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">R</forename><surname>Scherer</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Handbook of emotions</title>
				<editor>
			<persName><forename type="first">M</forename><surname>Lewis</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><forename type="middle">M</forename><surname>Haviland</surname></persName>
		</editor>
		<meeting><address><addrLine>New York</addrLine></address></meeting>
		<imprint>
			<publisher>The Guilford Press</publisher>
			<date type="published" when="1993">1993</date>
			<biblScope unit="page" from="185" to="198" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b25">
	<analytic>
		<title level="a" type="main">Vocal affect expression: a review and a model for future research</title>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">R</forename><surname>Scherer</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Psychological Bulletin</title>
		<imprint>
			<biblScope unit="volume">99</biblScope>
			<biblScope unit="page" from="143" to="165" />
			<date type="published" when="1986">1986</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b26">
	<analytic>
		<title level="a" type="main">Detecting changes in speech expressiveness in participants of a radio program</title>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">A</forename><surname>Barbosa</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">INTERSPEECH 2009, 10th Annual Conference of the International Speech Communication Association</title>
				<meeting><address><addrLine>Brighton, United Kingdom</addrLine></address></meeting>
		<imprint>
			<publisher>ISCA</publisher>
			<date type="published" when="2009">2009</date>
			<biblScope unit="page" from="2155" to="2158" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b27">
	<analytic>
		<title level="a" type="main">Survey on speech emotion recognition: Features, classification schemes, and databases</title>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">El</forename><surname>Ayadi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">S</forename><surname>Kamel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Karray</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Pattern Recognition</title>
		<imprint>
			<biblScope unit="volume">44</biblScope>
			<biblScope unit="page" from="572" to="587" />
			<date type="published" when="2011">2011</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b28">
	<analytic>
		<title level="a" type="main">A Perceptual Study of Intonation: An Experimental-Phonetic Approach to Speech Melody</title>
		<author>
			<persName><forename type="first">J</forename><surname>Hart</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Collier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Cohen</surname></persName>
		</author>
		<idno type="DOI">10.1017/CBO9780511627743</idno>
	</analytic>
	<monogr>
		<title level="m">Cambridge Studies in Speech Science and Communication</title>
				<imprint>
			<publisher>Cambridge University Press</publisher>
			<date type="published" when="1990">1990</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b29">
	<analytic>
		<title level="a" type="main">Acoustic profiles in vocal emotion expression</title>
		<author>
			<persName><forename type="first">R</forename><surname>Banse</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">R</forename><surname>Scherer</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of personality and social psychology</title>
		<imprint>
			<biblScope unit="volume">70</biblScope>
			<biblScope unit="page" from="614" to="636" />
			<date type="published" when="1996">1996</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b30">
	<analytic>
		<title level="a" type="main">Vocal communication of emotion</title>
		<author>
			<persName><forename type="first">T</forename><surname>Johnstone</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">R</forename><surname>Scherer</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Handbook of emotions</title>
				<editor>
			<persName><forename type="first">M</forename><surname>Lewis</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><forename type="middle">M</forename><surname>Haviland-Jones</surname></persName>
		</editor>
		<meeting><address><addrLine>New York</addrLine></address></meeting>
		<imprint>
			<publisher>The Guilford Press</publisher>
			<date type="published" when="2000">2000</date>
			<biblScope unit="page" from="220" to="235" />
		</imprint>
	</monogr>
	<note>2 ed</note>
</biblStruct>

<biblStruct xml:id="b31">
	<analytic>
		<title level="a" type="main">Impact of intended emotion intensity on cue utilization and decoding accuracy in vocal expression of emotion</title>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">N</forename><surname>Juslin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Laukka</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Emotion</title>
		<imprint>
			<biblScope unit="volume">1</biblScope>
			<biblScope unit="page" from="381" to="412" />
			<date type="published" when="2001">2001</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b32">
	<analytic>
		<title level="a" type="main">Statistical analysis of acoustic characteristics of tibetan lhasa dialect speech emotion</title>
		<author>
			<persName><forename type="first">D</forename><surname>Guo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Ding</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">SHS Web of Conferences</title>
		<imprint>
			<biblScope unit="volume">25</biblScope>
			<biblScope unit="page" from="1" to="5" />
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b33">
	<monogr>
		<title level="m" type="main">Neural Transfer Learning for Natural Language Processing</title>
		<author>
			<persName><forename type="first">S</forename><surname>Ruder</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2019">2019</date>
			<pubPlace>Galway</pubPlace>
		</imprint>
		<respStmt>
			<orgName>National University of Ireland</orgName>
		</respStmt>
	</monogr>
	<note type="report_type">Ph.D. thesis</note>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
