<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-title-group>
        <journal-title>Seminar of the Spanish Society for Natural
Language Processing: Projects and System Demonstrations, June</journal-title>
      </journal-title-group>
    </journal-meta>
    <article-meta>
      <title-group>
        <article-title>IKER-GAITU: Research on Language Technology for Basque and Other Low-Resource Languages</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author">
          <string-name>Eneko Agirre</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Itziar Aldabe</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Xabier Arregi</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Mikel Artetxe</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Unai Atutxa</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Ekhi Azurmendi</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Iker De la Iglesia</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Julen Etxaniz</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Victor García-Romillo</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Inma Hernaez-Rioja</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Asier Herranz</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Mikel Iruskieta</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Oier López de Lacalle</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Eva Navas</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Paula Ontalvilla</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Aitor Ormazabal</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Naiara Perez</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>German Rigau</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Oscar Sainz</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Jon Sanchez</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Ibon Saratxaga</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Aitor Soroa</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Christoforos Souganidis</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Jon Vadillo</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Aimar Zabala</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <aff id="aff0">
          <label>0</label>
          <institution>HiTZ Basque Center for Language Technology - Aholab Signal Processing Laboratory, University of the Basque Country UPV/EHU</institution>
        </aff>
        <aff id="aff1">
          <label>1</label>
          <institution>HiTZ Basque Center for Language Technology - Ixa NLP Group, University of the Basque Country UPV/EHU</institution>
        </aff>
      </contrib-group>
      <pub-date>
        <year>2024</year>
      </pub-date>
      <volume>1</volume>
      <fpage>9</fpage>
      <lpage>20</lpage>
      <abstract>
        <p>The general objective of the IKER-GAITU project is to research on language technology to increase the presence of Basque in the digital environment. It will be carried out between 2023 and 2025 thanks to a grant from the Department of Culture and Language Policy of the Basque Government. Current techniques require enormous amounts of textual and oral data per language. On the other hand, the data available for Basque and other low-resource languages might not be enough to attain the same quality as larger languages with the current technology. For this reason, it is essential to research on language technology, so that low-resource languages are present with the same quality as the rest of the languages in these technologies. IKER-GAITU pursues the following research objectives: 1. A system that automatically captures the level of Basque proficiency, written and oral; 2. Bring personalized voice technology to people with disabilities; 3. Spontaneous voice transcription, both when Basque and Spanish are mixed and when there are several speakers; 4. Textual conversational systems in Basque that match the quality of the most powerful large language models. In this project summary we present the results for the first year. More information at https://hitz.eus/iker-gaitu.</p>
      </abstract>
      <kwd-group>
        <kwd>Educational Applications</kwd>
        <kwd>Speech Synthesis</kwd>
        <kwd>Speech Recognition</kwd>
        <kwd>Large Language Models</kwd>
        <kwd>Low Resource Languages</kwd>
        <kwd>Basque</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec id="sec-1">
      <title>1. Introduction</title>
      <sec id="sec-1-1">
        <title>The general objective of the IKER-GAITU project is to</title>
        <p>research on language technology to increase the presence
of Basque in the digital environment. It will be carried
out between 2023 and 2025 thanks to a grant from the
Department of Culture and Language Policy of the Basque
Government.</p>
        <p>The digital revolution has reached global languages,
and to some extent also other languages such as Basque.
Language technology is one of the most fruitful fields
of Artificial Intelligence, which is having a profound
impact on society. Examples of this impact are the
current dialogue systems, both text-based, such as GPT, or
voice-based, similar to Siri or Alexa. Unfortunately, such
systems perform better in global languages such as
Spanish or French, which can create a dangerous gap to the
detriment of Basque.</p>
        <p>Current techniques require enormous amounts of
textual and oral data per language. On the other hand, the
data available for Basque and other low-resource
languages might not be enough to attain the same quality
as larger languages with the current technology. For this
reason, it is essential to research on language technology,
so that low-resource languages are present with the same
quality as the rest of the languages in these technologies.</p>
        <p>IKER-GAITU pursues the following research
objectives for the three-year period, organised in four working
packages:
• A system that automatically captures the level of</p>
        <p>Basque competence, written and oral;
• Bring personalized voice synthesis technology to
people with disabilities;
• Spontaneous voice transcription, both when
Basque and Spanish are mixed and when there
are several speakers;
• Textual conversational systems in Basque that
match the quality of the most powerful large
language models.</p>
      </sec>
      <sec id="sec-1-2">
        <title>In all these objective systems, the aim is to achieve</title>
        <p>a quality sufficient to be integrated in applications that
reach society as soon as possible. For this purpose the
results, data and algorithms that are created are being
distributed openly.1</p>
      </sec>
      <sec id="sec-1-3">
        <title>1https://hitz.eus/iker-gaitu</title>
        <p>For the first year, the specific objectives were the fol- Regarding the experimental setting, we propose two
lowing: 1) A first prototype for checking the proficiency scenarios for the evaluation of these models. In the first
level of written Basque; 2) A freely available foundation one, we have randomly distributed the training,
developmodel called Latxa, the largest language model built for ment and test partitions, whereas in the second one, we
Basque; 3) A first prototype of an open dialogue system. distributed the exercises by years. The random
distribu</p>
        <p>In the following sections we present the work per- tion implies that texts from the same period of analysis
formed in each of the working packages. can appear in both the training and test partitions.</p>
        <p>The best results obtained in these preliminary
experiments are close to 80% F1score in the case of random
2. Assessing the competence in distribution and 66% F1score when the distribution is
Basque done annually. There is no doubt that these results are
strongly conditioned by the scarcity of data. Our
hypothThis work package addresses the task of assessing the esis is that the performance and generalization ability of
written and oral competencies of Basque language learn- the classifier will improve by training with more text. In
ers, according to the levels established in the CEFR stan- the meantime we continue to investigate techniques to
dard (Common European Framework of Reference for improve learning with little data.</p>
        <p>Languages) for evaluation. With regard to the oral part of the C1 level assessment,</p>
        <p>The idea is to automatically evaluate written docu- we have begun to compile the resources and define the
ments or audios generated by the students. As a first ap- environment needed to perform an oral evaluation. First
proach, we will develop a binary classifier, which, given a of all, we defined a specific setup for the oral test: it would
text or an audio, will determine whether or not it would consist on an oral test to be guided by the computer. The
reach the C1 level. We propose the development of a student will perform the test individually and
indepenneural classifier, for which it is essential to have labeled dently, answering to the questions by voice. Thus, the
data, either text or audio, with which to train the system. format of the evaluation procedure does not match those</p>
        <p>HABE 2, an association dedicated to the teaching of been used so far over more than 15 years. Current
stanBasque and in charge of the evaluation of official tests, dard tests are conducted by one or two examiners who
has numerous data in different formats: electronic docu- evaluate two candidate students simultaneously,
generments, manuscripts and recordings or audios. These files ally by giving the students to make a monologue on a
are labeled with their respective evaluation, correspond- particular theme.
ing to the CEFR level. Through HABE, recordings of the C1-level tests of the</p>
        <p>In the design of the system, two autonomous processes last 15 years have been obtained for a total of about 150
have been distinguished: the one corresponding to the hours (448 audio files containing all recordings of test
evaluation of the written texts and the one corresponding sessions). In addition to the recordings and the label
to the oral exercises. ’pass/fail’, the scores obtained by each student according</p>
        <p>As far as written texts is concerned, the initial task to five criteria are available: fluidity, richness, correctness,
involved the extraction of editable text from a corpus of suitability and coherence. Thus, the recordings can be
157,268 manuscripts for communication levels B1, B2, C1 used to train and test a classifier of C1-level (pass/fail
and C2 in Basque. This transcription of handwritten text classification), if the audio segments corresponding to
is a complicated task, since all the manuscripts are by each student are extracted, transcribed and annotated.
different authors and the aim is to faithfully collect the However, the quality and characteristics of the
comoriginal text without any kind of correction. piled audio files are diverse and pilot experiments of</p>
        <p>
          The neural evaluator will be trained on these tran- automatic transcription gave very poor results. For this
scribed documents. However, without being waiting for reason a sample set of 16 files have been selected,
reprethe transcription work to be completed, we carried out a senting the different recording quality found in the files.
set experiments leveraging the available exercises in elec- The aim is to establish labelling conditions so that
stutronic format, about 800 exercises in all. On the one hand, dents’ audios can be extracted to be used in the training to
we fine-tuned the RoBERTa encoder for Basque [
          <xref ref-type="bibr" rid="ref1">1</xref>
          ]3 as re- train the classification system. Presently, we are
prepargression model and a binary classifier. On the other hand, ing the process of annotating the students interventions
we experimented with a preliminary version of Latxa, us- and their corresponding transcriptions. In a near future,
ing it in as fine-tuned classifier and as prompting-based we hope to use those audios to train a C1 Level
classimodel, with only 250 examples in the latter case. ifer and to perform experiments where the automatic
transcription will also be used for a final decision.
        </p>
      </sec>
      <sec id="sec-1-4">
        <title>2Helduen Alfabetatze eta Berreuskalduntzerako Erakundea.</title>
        <p>3https://huggingface.co/ixa-ehu/
roberta-eus-euscrawl-large-cased</p>
      </sec>
    </sec>
    <sec id="sec-2">
      <title>3. Personalised voice synthesis</title>
      <p>The aim of this work package is to advance the
development of synthesis technologies that facilitate
communication between people with oral disabilities with special
focus on Basque. To this end, the following objectives
shall be pursued:
• To develop the technology necessary to create
personalized voices
• To implement voice models in environments with
limited capacity (embedded models)
• The voice models have migrated to ONNX (to be
used in a mobile device) .
• For standard voices, three bilingual (ES/EU)
voices (women, men and children) have been
developed. These voices cannot be used for
commercial purposes without explicit permission from
the speaker.
• High-quality children’s voices have been acquired
and personalised for them. Those voices will be
available on the voice bench.</p>
      <sec id="sec-2-1">
        <title>The demonstrations of the work done can be found</title>
        <p>here.</p>
        <p>
          Voice personalization requires the use of a TTS neural
architecture that having been trained with a high amount
or audio data, it is able to adapt to a new voice using only 4. Spontaneous speech
a few recordings from the new speaker. A number of transcription
architectures based on Tacotron2 [
          <xref ref-type="bibr" rid="ref2">2</xref>
          ], FastSpeech2 [
          <xref ref-type="bibr" rid="ref3">3</xref>
          ]
and VITS [
          <xref ref-type="bibr" rid="ref4">4</xref>
          ] have been evaluated and its performance The main objective of this work package is to obtain open
tested in a variety of experiments. Additionally, several ASR models able to perform transcription on bilingual
vocoders have been trained, mostly based on the HiFi- (Basque-Spanish) environments. For the first year, the
GAN architecture [
          <xref ref-type="bibr" rid="ref5">5</xref>
          ], to minimize computational load goal was to define the data sets required for bilingual
without diminishing the quality of voice obtained. automatic transcription and the sources from which they
        </p>
        <p>
          Several models have been trained with recordings ob- should be extracted. Simultaneously, different
architectained from voice talents, using different architectures.
Several experiments have also been conducted to evaluate
different training and fine-tuning strategies. In addition, 4.1. Prototypes
techniques for adapting systems to language dialects have
been investigated using embedding. In this context, a The available ASR architectures have been evaluated.
research stay has been held at the OFAI (Vienna, Austria) Priority has been given to those who allow the open use
and a publication at an international congress [
          <xref ref-type="bibr" rid="ref6">6</xref>
          ]. In the of the models. The main architectures that have been
future these techniques will be used to adapt the system investigated are the Conformer-CTC and the Conformer
to most relevant Basque dialects. Transducer within the Nvidia-NeMo framework 4.
        </p>
        <p>Another area of application of these technologies has
been the development of personalized voices for children. 4.2. Data
Special efforts have been done to obtain recordings of
children’s quality voices, to create different synthetic
voices for children. train the prototypes. As for the audio data used to train</p>
        <p>
          Finally, two neural architectures using Tacotron2 and the models, the databases in Basque used have been the
VITS have been chosen to automate the personalization last two versions from Mozilla Common Voice (16 and
process from a small set of recordings and to be imple- 17) [
          <xref ref-type="bibr" rid="ref7">7</xref>
          ], OpenSLR, and the recordings from the Basque
mented to run on different platforms: Linux, Windows Parliament recently published 5. See Table 1 for the
and Android. To do this, different options were explored amount of hours for each dataset.
and neural models were transferred to the ONNX plat- The language model used was generated mainly using
form. Using ONNX, new neural networks have been Wikipedia and has about 27 M sentences.
integrated into Aholab’s AhoTTS synthesizer.
        </p>
        <p>Thus, the following specific objectives have been 4.3. Preliminary results
achieved by 2023:
• For voice synthesis, three SoA architectures have
been studied that allow personalization. The best
among them has been chosen. The selection has
been made taking into account the ability to
personalize as well as the final quality of the
synthetic voice.</p>
        <p>Many experiments have been performed using the
described models and data. The best results so far have
been obtained using the Nvidia NeMo conformer-ctc</p>
      </sec>
      <sec id="sec-2-2">
        <title>4https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/</title>
        <p>en/stable/asr/models.html</p>
        <p>5https://huggingface.co/datasets/gttsehu/basque_parliament_
Table 1 or know how to adapt them via fine-tuning. We are
alNumber of hours in each audio dataset ready working on instruction-following models, but it is
Dataset Train Test Dev still an open research issue whether models usable by the
Common Voice 15 58 h 10.83 h – general public with similar quality to GPT can be
conCommon Voice 16 173.78 21.50 – structed for Basque. The models were developed using
Open SLR 6.45 2 – in-house GPUs, with the final models being trained on the
Basque Parliament 368 2.85 2.62 Leonardo supercomputer at CINECA under the EuroHPC
Joint Undertaking (project EHPC-EXT-2023E01-013).</p>
        <p>For the corpora, we leveraged a new corpus comprising
model fine-tuned with all the available audio data (CV 4.3M documents and 4.2B tokens after deduplication and
16) from a Spanish pretrained model 6. This model ob- filtering.
tained a WER of 2.22% when tested with CV16 test set, Addressing the scarcity of high-quality benchmarks
a WER of 9.31% when tested with OpenSLR test dataset, for Basque, we introduce four multiple-choice evaluation
and 4.22% when tested with the Basque Parliament test datasets: questions from official language proficiency
exdataset. The low WER obtained for the CV test dataset ams; reading comprehension questions; trivia questions
can be explained by the probable leak of sentences of CV from five knowledge areas; and questions from public
in the training text set used to generate the Language examinations.</p>
        <p>Model. An study of contamination between the databases In our extensive evaluation (cf. Figure 1), Latxa
outshowed that 43.88% of the sentences in the CV16 test-set performs all previous open models we compare to by a
was included in the Language Model training set. On the large margin. In addition, it is better than GPT-3.5 Turbo,
other hand, only 0.29% leakage was found in OpenSLR and better than GPT-4 Turbo in language proficiency
and 0.00% in the Basque Parliament test-set. The de- and understanding, despite lagging behind in reading
scribed models are available at Hugging Face 7,8. comprehension and knowledge-intensive tasks.
To assess the quality of the models, we thoroughly
eval5. Basque language model uated them on a suite of diverse and challenging tasks.
The tasks evaluate the performance of the models for a
This first year we built the Latxa model family, the largest variety of linguistic competences such as reading
compreand best-performing LLMs available for Basque. Latxa is hension, common sense reasoning, sentiment analysis,
a breed of domestic sheep native to the Basque Country, stance detection, topic classification, correference,
inferfamous for its cheese. ence and word senses (see model cards in HuggingFace</p>
        <p>Our Latxa is a family of Large Language Models (LLM) for more details on evaluation datasets and procedure).
ranging from 7 to 70 billion model parameters based on The results in the figure below show the performance of
Meta’s LLaMA models. Current LLMs exhibit incredible different models, with the average in the rightmost part.
performance for high-resource languages such as English, We tested the English LLaMA models as well as some of
ChatGPT being the most popular example. But, in the the best language models for Basque to date, allowing for
case of Basque and other low-resource languages, their head-to-head comparison with our models (three purple
performance is close to a random guesser, widening the bars). The figure clearly indicates the superiority of our
technological gap between high- and low-resource lan- three models, as well as the improvement of results as
guages when it comes to digital tools. We present Latxa we increase model size.
to overcome these limitations and promote the develop- Latxa models inherit the LLaMA-2 License, which
alment of LLM-based research, innovation and products lows for commercial and research use. Although based
for the Basque language. on an English LLM, these models are intended to be used</p>
        <p>
          The Latxa family of models are pre-trained base LLM with Basque text; for any other language the performance
models, without further fine-tuning on user-oriented in- is not guaranteed.
structions or preferences. These models are thus not for The corpora, models and evaluation benchmarks,
todirect use by the general public. These models are key gether with the code, form an open language model and
to building successful NLP tools for Basque. We release evaluation suite for Basque. The suite is described in [
          <xref ref-type="bibr" rid="ref8">8</xref>
          ],
these open models to be used by technicians that know and is publicly available in HuggingFace9, please refer to
how to include such base LLMs in final-user applications, the model card for more technical information and to get
started with the models.
        </p>
        <p>In addition to Latxa, we have explored whether
multilingual language models perform better when working
6https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/
models/stt_es_conformer_ctc_large</p>
        <p>7https://huggingface.co/HiTZ/stt_eu_conformer_transducer_
large
8https://huggingface.co/HiTZ/stt_eu_conformer_ctc_large</p>
      </sec>
      <sec id="sec-2-3">
        <title>9https://huggingface.co/collections/HiTZ/</title>
        <p>
          latxa-65a697e6838b3acc53677304
in English than in Basque for the same tasks, even if ma- University of the Basque Country (UPV/EHU). Aholab
chine translation is used in the process [
          <xref ref-type="bibr" rid="ref9">9</xref>
          ], whether small is a university research team and since 1995 it focuses
domain-adapted language models can be combined with its research in the areas of Text to Speech Conversion,
large generalistic language models [
          <xref ref-type="bibr" rid="ref10">10</xref>
          ] and the effect of Speech and Speaker Recognition and Speech Processing
language typology when transferring knowledge from in general. The laboratory is part of the Basque Center
one language to the other on a wide range of experiments for Language Technology (HiTZ) and the Department of
involving language models [
          <xref ref-type="bibr" rid="ref11">11</xref>
          ]. Communications Engineering of the Faculty of
Engineering of Bilbao (EIB).
        </p>
      </sec>
    </sec>
    <sec id="sec-3">
      <title>6. Conclusions</title>
    </sec>
    <sec id="sec-4">
      <title>7. Research Groups participating in the project</title>
      <p>Ixa NLP Group. Ixa12 is a research group from the
UniThe objectives of the first year have been met. On the versity of the Basque Country (UPV/EHU) that works
second year we are going to focus on data collection, in all areas of Natural Language Processing. Ixa is a
as it is the key for the objectives set in the project. In multidisciplinary group with more than 25 years of
exparallel we plan to improve the models in all four working perience, comprising computer scientists, linguists and
packages. In the case of the dialogue system, we will other disciplines. The group is based on the Computer
also focus on developing an instruction-tuned and aligned Science Faculty in San Sebastian and the Languages and
models. Computer Systems department, but many members
belong to other faculties and departments of the UPV/EHU.</p>
      <p>The group is part of the Basque Center for Language
Technology (HiTZ).</p>
      <sec id="sec-4-1">
        <title>The project is carried out by HiTZ Basque Center for</title>
        <p>Language Technology10 of the University of the Basque This research project is funded by a grant from the
DeCountry UPV/EHU, which comprises the following re- partment of Culture and Language Policy of the Basque
search groups. Government.</p>
      </sec>
    </sec>
    <sec id="sec-5">
      <title>Acknowledgments</title>
      <p>Aholab Signal Processing Laboratory. Aholab11 is
the short name of the Signal Processing Laboratory of the
10https://hitz.ehu.eus
11https://aholab.ehu.eus/
12https://ixa.ehu.eus</p>
    </sec>
  </body>
  <back>
    <ref-list>
      <ref id="ref1">
        <mixed-citation>
          [1]
          <string-name>
            <given-names>M.</given-names>
            <surname>Artetxe</surname>
          </string-name>
          ,
          <string-name>
            <surname>I. Aldabe</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Agerri</surname>
          </string-name>
          ,
          <string-name>
            <given-names>O.</given-names>
            <surname>Perez-de Viñaspre</surname>
          </string-name>
          , A. Soroa,
          <article-title>Does corpus quality really matter for low-resource languages?</article-title>
          , in: Y.
          <string-name>
            <surname>Goldberg</surname>
            ,
            <given-names>Z.</given-names>
          </string-name>
          <string-name>
            <surname>Kozareva</surname>
          </string-name>
          , Y. Zhang (Eds.),
          <source>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</source>
          , Association for Computational Linguistics, Abu Dhabi, United Arab Emirates,
          <year>2022</year>
          , pp.
          <fpage>7383</fpage>
          -
          <lpage>7390</lpage>
          . URL: https: //aclanthology.org/
          <year>2022</year>
          .emnlp-main.
          <volume>499</volume>
          . doi:
          <volume>10</volume>
          . 18653/v1/
          <year>2022</year>
          .emnlp-main.
          <volume>499</volume>
          .
        </mixed-citation>
      </ref>
      <ref id="ref2">
        <mixed-citation>
          [2]
          <string-name>
            <given-names>Y.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Skerry-Ryan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Stanton</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Wu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R. J.</given-names>
            <surname>Weiss</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Jaitly</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Yang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Xiao</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Chen</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Bengio</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Q.</given-names>
            <surname>Le</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Agiomyrgiannakis</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Clark</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R. A.</given-names>
            <surname>Saurous</surname>
          </string-name>
          , Tacotron:
          <article-title>Towards end-to-end speech synthesis</article-title>
          ,
          <source>in: Proceedings of INTERSPEECH, ISCA</source>
          ,
          <year>2017</year>
          , pp.
          <fpage>4006</fpage>
          -
          <lpage>4010</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref3">
        <mixed-citation>
          [3]
          <string-name>
            <given-names>Y.</given-names>
            <surname>Ren</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Hu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Tan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Qin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Zhao</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Zhao</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.-Y.</given-names>
            <surname>Liu</surname>
          </string-name>
          ,
          <article-title>Fastspeech 2: Fast and high-quality end-to-end text to speech</article-title>
          , arXiv preprint arXiv:2006.04558 (
          <year>2020</year>
          ).
        </mixed-citation>
      </ref>
      <ref id="ref4">
        <mixed-citation>
          [4]
          <string-name>
            <given-names>J.</given-names>
            <surname>Kim</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Kong</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Son</surname>
          </string-name>
          ,
          <article-title>Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech</article-title>
          ,
          <source>in: International Conference on Machine Learning, PMLR</source>
          ,
          <year>2021</year>
          , pp.
          <fpage>5530</fpage>
          -
          <lpage>5540</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref5">
        <mixed-citation>
          [5]
          <string-name>
            <given-names>J.</given-names>
            <surname>Kong</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Kim</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Bae</surname>
          </string-name>
          , Hifi-gan:
          <article-title>Generative adversarial networks for efficient and high fidelity speech synthesis</article-title>
          ,
          <source>Advances in Neural Information Processing Systems</source>
          <volume>33</volume>
          (
          <year>2020</year>
          )
          <fpage>17022</fpage>
          -
          <lpage>17033</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref6">
        <mixed-citation>
          [6]
          <string-name>
            <given-names>L.</given-names>
            <surname>Gutscher</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Pucher</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Garcia</surname>
          </string-name>
          ,
          <article-title>Neural Speech Synthesis for Austrian Dialects with Standard German Grapheme-to-Phoneme Conversion and Dialect Embeddings</article-title>
          ,
          <source>in: Proc. 2nd Annual Meeting of the ELRA/ISCA SIG on Under-resourced Languages (SIGUL 2023)</source>
          ,
          <year>2023</year>
          , pp.
          <fpage>68</fpage>
          -
          <lpage>72</lpage>
          . doi:10.21437/SIGUL.2023-15.
        </mixed-citation>
      </ref>
      <ref id="ref7">
        <mixed-citation>
          [7]
          <string-name>
            <given-names>R.</given-names>
            <surname>Ardila</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Branson</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Davis</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Henretty</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Kohler</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Meyer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Morais</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Saunders</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F. M.</given-names>
            <surname>Tyers</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Weber</surname>
          </string-name>
          ,
          <article-title>Common voice: A massively multilingual speech corpus</article-title>
          ,
          <source>in: Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)</source>
          ,
          <year>2020</year>
          , pp.
          <fpage>4211</fpage>
          -
          <lpage>4215</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref8">
        <mixed-citation>
          [8]
          <string-name>
            <given-names>J.</given-names>
            <surname>Etxaniz</surname>
          </string-name>
          ,
          <string-name>
            <given-names>O.</given-names>
            <surname>Sainz</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Perez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>I.</given-names>
            <surname>Aldabe</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Rigau</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Agirre</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Ormazabal</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Artetxe</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Soroa</surname>
          </string-name>
          , Latxa:
          <article-title>An open language model and evaluation suite for Basque</article-title>
          ,
          <year>2024</year>
          . arXiv:2403.20266.
        </mixed-citation>
      </ref>
      <ref id="ref9">
        <mixed-citation>
          [9]
          <string-name>
            <given-names>J.</given-names>
            <surname>Etxaniz</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Azkune</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Soroa</surname>
          </string-name>
          ,
          <string-name>
            <given-names>O. L.</given-names>
            <surname>de Lacalle</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Artetxe</surname>
          </string-name>
          ,
          <article-title>Do multilingual language models think better in English</article-title>
          ?,
          <year>2023</year>
          . arXiv:2308.01223.
        </mixed-citation>
      </ref>
      <ref id="ref10">
        <mixed-citation>
          [10]
          <string-name>
            <given-names>A.</given-names>
            <surname>Ormazabal</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Artetxe</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Agirre</surname>
          </string-name>
          , CombLM:
          <article-title>Adapting black-box language models through small fine-tuned models</article-title>
          ,
          <year>2023</year>
          . arXiv:2305.16876.
        </mixed-citation>
      </ref>
      <ref id="ref11">
        <mixed-citation>
          [11]
          <string-name>
            <given-names>M.</given-names>
            <surname>Zubillaga</surname>
          </string-name>
          ,
          <string-name>
            <given-names>O.</given-names>
            <surname>Sainz</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Estarrona</surname>
          </string-name>
          ,
          <string-name>
            <given-names>O. L.</given-names>
            <surname>de Lacalle</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Agirre</surname>
          </string-name>
          ,
          <article-title>Event extraction in basque: Typologically motivated cross-lingual transfer-learning analysis</article-title>
          ,
          <source>in: Proceedings of the 15th Conference on Language Resources and Evaluation (LREC-Coling 2024)</source>
          ,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
    </ref-list>
  </back>
</article>