<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta />
    <article-meta>
      <title-group>
        <article-title>Overview of BirdCLEF 2024: Acoustic Identification of Under-studied Bird Species in the Western Ghats</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author">
          <string-name>Stefan Kahl</string-name>
          <email>stefan.kahl@cornell.edu</email>
          <xref ref-type="aff" rid="aff1">1</xref>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Tom Denton</string-name>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Holger Klinck</string-name>
          <email>holger.klinck@cornell.edu</email>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Vijay Ramesh</string-name>
          <xref ref-type="aff" rid="aff5">5</xref>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Viral Joshi</string-name>
          <email>viraljoshi@students.iisertirupati.ac.in</email>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Meghana Srivathsa</string-name>
          <email>meghana.srivathsa@gmail.com</email>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Akshay Anand</string-name>
          <email>akshayvinodanand@floridamuseum.ufl.edu</email>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Chiti Arvind</string-name>
          <email>chitiarvind@students.iisertirupati.ac.in</email>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Harikrishnan CP</string-name>
          <email>harikrishnan.cp@students.iisertirupati.ac.in</email>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Suyash Sawant</string-name>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Robin V V</string-name>
          <email>robin@labs.iisertirupati.ac.in</email>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Hervé Glotin</string-name>
          <email>herve.glotin@univ-tln.fr</email>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Hervé Goëau</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Willem-Pier Vellinga</string-name>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Robert Planqué</string-name>
          <xref ref-type="aff" rid="aff8">8</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Alexis Joly</string-name>
          <email>alexis.joly@inria.fr</email>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <aff id="aff0">
          <label>0</label>
          <institution>CIRAD, UMR AMAP</institution>
          ,
          <addr-line>Montpellier</addr-line>
          ,
          <country country="FR">France</country>
        </aff>
        <aff id="aff1">
          <label>1</label>
          <institution>Chemnitz University of Technology</institution>
          ,
          <addr-line>Chemnitz</addr-line>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff2">
          <label>2</label>
          <institution>Google Deepmind</institution>
          ,
          <addr-line>San Francisco</addr-line>
          ,
          <country country="US">USA</country>
        </aff>
        <aff id="aff3">
          <label>3</label>
          <institution>Indian Institute of Science Education and Research (IISER) Tirupati</institution>
          ,
          <addr-line>Tirupati</addr-line>
          ,
          <country country="IN">India</country>
        </aff>
        <aff id="aff4">
          <label>4</label>
          <institution>Inria, LIRMM, University of Montpellier</institution>
          ,
          <addr-line>CNRS, Montpellier</addr-line>
          ,
          <country country="FR">France</country>
        </aff>
        <aff id="aff5">
          <label>5</label>
          <institution>K. Lisa Yang Center for Conservation Bioacoustics, Cornell Lab of Ornithology, Cornell University</institution>
          ,
          <addr-line>Ithaca</addr-line>
          ,
          <country country="US">USA</country>
        </aff>
        <aff id="aff6">
          <label>6</label>
          <institution>Project Dhvani</institution>
          ,
          <addr-line>Bangalore</addr-line>
          ,
          <country country="IN">India</country>
        </aff>
        <aff id="aff7">
          <label>7</label>
          <institution>University of Toulon, AMU</institution>
          ,
          <addr-line>CNRS, LIS, Marseille</addr-line>
          ,
          <country country="FR">France</country>
        </aff>
        <aff id="aff8">
          <label>8</label>
          <institution>Xeno-canto Foundation</institution>
          ,
          <addr-line>Groningen</addr-line>
          ,
          <country country="NL">Netherlands</country>
        </aff>
      </contrib-group>
      <abstract>
        <p>The BirdCLEF 2024 challenge focused on the acoustic identification of understudied bird species in the Western Ghats, a biodiversity hotspot in India. This edition aimed to advance passive acoustic monitoring by tasking participants with developing reliable systems for detecting and identifying bird vocalizations from extensive soundscape recordings. Using training data provided by the Xeno-Canto community and new unlabeled soundscapes from the Western Ghats, participants addressed the challenges of domain adaptation and limited training data for many species. Participants employed techniques such as pseudo-labeling, test-time augmentation, and diverse ensembles, significantly improving model performance. Notable strategies also included the use of single-class cross-entropy and Contrastive Adversarial Domain (CAD) bottlenecks, which provided innovative solutions to acoustic data analysis challenges. The highest-scoring submission achieved an ROC-AUC score of 0.690 on the private leaderboard (0.738 on the public leaderboard), with the top 10 systems differing by only 1.5% in their scores.</p>
      </abstract>
      <kwd-group>
        <kwd>LifeCLEF</kwd>
        <kwd>bird</kwd>
        <kwd>song</kwd>
        <kwd>call</kwd>
        <kwd>species</kwd>
        <kwd>retrieval</kwd>
        <kwd>audio</kwd>
        <kwd>collection</kwd>
        <kwd>identification</kwd>
        <kwd>fine-grained classification</kwd>
        <kwd>evaluation</kwd>
        <kwd>benchmark</kwd>
        <kwd>bioacoustics</kwd>
        <kwd>passive acoustic monitoring</kwd>
        <kwd>PAM</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec id="sec-1">
      <title>1. Introduction</title>
      <p>
        Passive acoustic monitoring (PAM), which uses autonomous recording units (ARUs) to study animals
and their habitats at ecologically meaningful scales, has become an essential method in conservation
[
        <xref ref-type="bibr" rid="ref1">1</xref>
        ]. The availability of affordable, off-the-shelf ARUs has enabled extensive data collection efforts
in many regions worldwide. Typically, arrays of these recorders are deployed for long durations
(weeks to months), producing large volumes of data that provide valuable insights into the abundance
and distribution of vocalizing animals with high spatial and temporal resolution [
        <xref ref-type="bibr" rid="ref2">2</xref>
        ]. However, PAM
faces several ongoing challenges. Data collection efforts can result in many terabytes of acoustic data
that must be efficiently managed, stored, and analyzed [
        <xref ref-type="bibr" rid="ref3">3</xref>
        ]. In particular, the task of analyzing this
data—reliably extracting relevant signals from often complex soundscapes—is still an active area of
research. Additionally, while ample data for common species is usually available to train models, data
for rare, listed, or endangered species is often scarce. This scarcity necessitates the development of
innovative algorithmic approaches to monitor these species effectively.
      </p>
      <p>
        The Western Ghats is a mountain range that runs along the southwestern coast of India [
        <xref ref-type="bibr" rid="ref4">4</xref>
        ]. This
region is home to very high levels of biodiversity and supports the livelihoods of millions of people.
Over 500 bird species have been reported in this region, of which several species are rare, endangered,
and endemic (see Figure 1). Automated identification of calls from different species is challenging in
this region due to the high number of vocalizing bird species resulting in complex soundscapes with
frequently overlapping calls.
      </p>
      <p>
        The Bird Recognition Challenge (BirdCLEF) is an integral part of LifeCLEF 2024 [
        <xref ref-type="bibr" rid="ref5">5</xref>
        ], aimed at developing
robust analytical frameworks for detecting and identifying bird vocalizations in continuous soundscape
recordings. Initiated in 2014, BirdCLEF has grown into one of the largest bird sound recognition contests,
featuring tens of thousands of recordings representing up to 1,500 species [
        <xref ref-type="bibr" rid="ref6 ref7">6, 7</xref>
        ]. The 2024 edition of
BirdCLEF tasks participants with creating reliable systems for identifying bird calls within soundscapes
from the Western Ghats, despite the challenge of having limited training data for many species.
      </p>
    </sec>
    <sec id="sec-2">
      <title>2. BirdCLEF 2024 Competition Overview</title>
      <p>
        Recent progress in machine listening techniques for identifying animal vocalizations has significantly
improved our ability to analyze long-term acoustic datasets comprehensively [
        <xref ref-type="bibr" rid="ref8 ref9">8, 9</xref>
        ]. Nevertheless,
achieving high precision and recall remains challenging, especially when dealing with numerous
species simultaneously. A key difficulty in acoustic event detection and classification lies in bridging
the gap between high-quality training samples (focal recordings) and noisy test samples (soundscape
recordings). The 2024 BirdCLEF competition, hosted on Kaggle1, tackled this complex issue by tasking
participants with identifying bird calls in soundscape recordings from the Western Ghats in India. The
competition followed the "code competition" format, encouraging participants to share their code for
the benefit of the community, particularly scientists and practitioners monitoring bird populations
for conservation in India. Additionally, submissions were required to complete inference within two
hours to ensure the models could run efficiently on the modest computing resources available to
conservationists.
(b) Chestnut-headed Bee-eater
(c) Gray-headed Canary-Flycatcher
(d) Velvet-fronted Nuthatch
      </p>
      <sec id="sec-2-1">
        <title>2.1. Goal and Evaluation Protocol</title>
        <p>This year’s competition featured two major changes compared to the previous few years: A new metric
was used for evaluation (macro-averaged ROC-AUC that skips classes that have no true positive labels),
and inference was limited to two CPU hours.</p>
      </sec>
      <sec id="sec-2-2">
        <title>2.2. Metric</title>
        <p>This year, we used class-averaged ROC-AUC as the competition metric. ROC-AUC is best considered a
rank-based metric: it is the probability that a positive example scores higher than a negative example
when the positive and negative examples are independently chosen uniformly at random. We compute
the ROC-AUC independently for each class present in the test data and then average over classes to
obtain the model score.</p>
        <p>
          As a threshold-free metric, ROC-AUC allows comparing overall model quality, without requiring
participants to engage in difficult (and opaque) threshold-selection processes. It is also, by construction,
indifferent to the positive/negative label balance within the dataset, though values can be noisy for
extremely rare classes [
          <xref ref-type="bibr" rid="ref10">10</xref>
          ].
        </p>
      </sec>
      <sec id="sec-2-3">
        <title>2.3. Time Limits</title>
        <p>Competitors were limited to two hours of inference time on a CPU. This ensures that models are
cost-effective for real-world usage. A side effect is reducing the impact of ensembling, a common Kaggle
tactic obscuring underlying model quality.</p>
      </sec>
      <sec id="sec-2-4">
        <title>2.4. Dataset</title>
        <p>2.4.1. Training Data
As in previous editions, the training data for the competition was sourced from the Xeno-Canto
community, comprising over 25,000 recordings spanning 182 species. Participants were permitted to
use metadata to enhance their systems and to download/utilize additional Xeno-Canto recordings.
Additionally, we offered detailed information on the locations and times of both focal and soundscape
recordings, enabling participants to consider the spatio-temporal occurrence patterns of bird species in
their analyses.</p>
        <p>In addition, we supplied 8,444 unlabeled soundscape recordings from the same sites as the test
data, though recorded on different dates to ensure no overlap. Participants were allowed to use these
recordings to fine-tune their models or apply them for unsupervised learning during model training.
2.4.2. Test Data
As in previous years on Kaggle, the test data was completely hidden from participants. Hidden test data
consisted of 1,073 soundscape recordings of 4-minute duration and were recorded at multiple locations
within the Western Ghats. Most of the audio data was collected across the Anamalai and the Palani
hills. These hill ranges largely consist of mid-elevation tropical wet evergreen rainforests and span an
elevational gradient of ∼ 700 meters to 2,300 meters above sea level.</p>
        <p>
          Acoustic data were collected as part of an ongoing project to assess the impacts of ecological
restoration work on bird diversity. Across a gradient of forest regeneration (consisting of actively
restored, naturally regenerating, and undisturbed benchmark forest sites, see Figure 2), AudioMoth
ARUs were deployed to collect acoustic data [
          <xref ref-type="bibr" rid="ref11">11</xref>
          ]. These passive monitoring devices were placed on
trees, approximately 2 meters above the ground at each site. Using a sampling rate of 48 kHz and a gain
of 40 dB, each recorder was deployed to record data in 4-minute segments every 5 minutes for seven
consecutive days at each site between March 2020 and January 2021. (Data could not be collected in
April 2020 due to the COVID-19 pandemic). For more details, please see [
          <xref ref-type="bibr" rid="ref12">12</xref>
          ].
        </p>
        <p>(a) Naturally regenerating rainforest
(b) Protected area rainforest</p>
        <p>We identified all vocalizing bird species at a given site on a subset of the data recorded across each site.
Each audio recording was broken down into 10-second audio segments for bird species identification. This
was the shortest time period necessary to identify vocalizing bird species accurately. The annotation
process resulted in 13,701 labels for 108 species.</p>
      </sec>
    </sec>
    <sec id="sec-3">
      <title>3. Results</title>
      <p>A total of 974 teams with nearly 1,200 competitors participated in the BirdCLEF 2024 competition,
submitting a total of 30,118 runs. As in recent years, two-thirds of the test data was allocated to the
private leaderboard and one-third to the public leaderboard. Based on the ROC-AUC metric, the baseline
score was 0.5, with random confidence scores for all birds across all segments. The highest-scoring
submission achieved 0.690 (0.738 on the public leaderboard), with the top 10 systems differing by
only 1.5% in their scores. There was a notable shake-up in the ranking between the public and the
private leaderboard. While the top teams largely maintained their positions, many lower-ranked teams
experienced significant drops due to the influence of a highly effective public code notebook 2, which
led to many ranks being assigned based on execution date.</p>
      <sec id="sec-3-1">
        <title>3.1. Online write-ups</title>
        <p>A few common themes from online write-ups3 emerged in the top solutions: the use of pseudo-labeling
for unlabeled data, the implementation of test-time augmentation, and the deployment of diverse
ensembles.</p>
        <p>
          The public unlabeled data was a new addition to this year’s competition, and perhaps unsurprisingly,
many of the top competitors found ways to take advantage of it. Pseudo-labeling in this context
provides aspects of both domain adaptation and knowledge distillation. Domain adaptation helps
models cope with distributional differences between the train and test data: in the bioacoustic context,
this includes changes in class frequency, geographic variation in vocalizations (dialects), and differences
in recording characteristics (signal-to-noise ratio, device characteristics, and/or compression artifacts).
When only unsupervised data is available for adaptation, as in this competition, the problem is known
as source-free domain adaptation (SFDA). The SFDA task is particularly challenging in the multi-class,
multi-label context [
          <xref ref-type="bibr" rid="ref13">13</xref>
          ]. Pseudo-labeling can also be interpreted as a form of knowledge distillation,
as the pseudo-labels can be produced by large, pre-trained models (or ensembles); many of the top
teams used models too slow for submission (such as the Google Perch classifier) or larger ensembles to
produce pseudo-labels on the unlabeled data and the weakly-labeled Xeno-canto data.
        </p>
        <p>Most of the top competitors also used a specific form of test-time augmentation: producing predictions
for time-shifted audio windows and averaging with the predictions for the target window. This provides
diverse views of the target data for the ensemble.</p>
        <p>Finally, two competitors (in 4th and 5th place) produced a raw-waveform model, which ran
in an ensemble with the standard spectrogram models. While these models underperformed
spectrogram-based models individually, they improved the overall ensemble, presumably by obtaining
diverse features from the audio. These competitors were the highest-ranking competitors who did not
use pseudo-labeling, which suggests that this is a strong technique, orthogonal to pseudo-labeling.</p>
        <p>
          Overall, the message from the top competitors is clear: robust pseudo-labeling strategies and diverse
ensembles (whether from test-time augmentation or raw-waveform members) consistently made a
significant impact. Two unique strategies were also notable among the top ten submissions. The
first-place submission employed single-class cross-entropy for training, noting that multi-label samples
were relatively rare in the unlabeled data. This approach provided strong regularization during model
training but also necessitated additional efforts to generate meaningful per-class predictions at test
time. The ninth-place submission utilized a Contrastive Adversarial Domain (CAD) bottleneck to
obtain domain-invariant features [
          <xref ref-type="bibr" rid="ref14">14</xref>
          ], ensuring that model embeddings for the training data were
indistinguishable from those of the unlabeled in-domain data, effectively minimizing domain-shift
issues.
        </p>
        <sec id="sec-3-1-1">
          <title>2https://www.kaggle.com/code/zulqarnainalipk/birdclef-2024-species-identification-from-audio</title>
          <p>3Individual write-ups can be accessed via the "Solution" icon on the leaderboard: https://www.kaggle.com/competitions/
birdclef-2024/leaderboard</p>
        </sec>
      </sec>
      <sec id="sec-3-2">
        <title>3.2. Working notes</title>
        <p>
          We accepted seven working notes for the proceedings, which document the approaches and
methodologies used by individual teams:
Dmitriev, Konstantin V. [
          <xref ref-type="bibr" rid="ref15">15</xref>
          ]: The author used semi-supervised and self-supervised labeling to create
pseudo-labels for unlabeled datasets, applied data augmentation techniques like MixUp and CutMix, and
employed advanced post-processing such as sliding window averaging. Data preprocessing methods
standardized recording lengths, and additional noise sources such as traffic, human voices, and weather
sounds were incorporated to improve model generalization. Location data was utilized to address
geographical variations in bird calls, and inference time optimization was achieved using techniques
like weight rounding and conversion to efficient frameworks such as ONNX and OpenVino. The highest
score achieved by the participant was a public leaderboard score of 0.684 and a private leaderboard
score of 0.6374.
        </p>
        <p>
          Hong, Lihang [
          <xref ref-type="bibr" rid="ref16">16</xref>
          ]: This participant employs semi-supervised and self-supervised labeling of
soundscapes, knowledge distillation, and data augmentation. Off-the-shelf models BirdNET [
          <xref ref-type="bibr" rid="ref8">8</xref>
          ] and the Google
Bird Vocalization Classifier 5 were used to label large unlabeled datasets, which were then employed in
training. Data augmentation techniques such as MixUp and CutMix were used. The combined approach
of using labeled soundscapes and knowledge distillation significantly improved performance, achieving
a maximum private leaderboard score of 0.681 (public leaderboard score 0.695).
        </p>
        <p>
          Witting et al. [
          <xref ref-type="bibr" rid="ref17">17</xref>
          ]: The authors implemented a combination of data augmentations and pre- and
post-processing techniques to improve model robustness. Specifically, they used noise reduction
methods, location-specific data augmentation, and temporal context adjustments. The best-performing
models incorporated spectrogram-based architectures enhanced with pseudo-labeling and test-time
4The highest scores in the working notes don’t always match the official leaderboard scores because participants choose two
runs for official scoring based only on public leaderboard performance.
5https://www.kaggle.com/models/google/bird-vocalization-classifier
augmentation, achieving a maximum private leaderboard score of 0.651 and a public leaderboard score
of 0.738.
        </p>
        <p>Lasseck, Mario [18]: The approach of this participant involves creating pseudo-labels for a large
number of unlabeled recordings from the target location and using them in training. The best-performing
models utilized the EfficientNetB0 architecture with MixUp and CutMix augmentations. The method
includes pre- and post-processing techniques such as noise reduction, location-specific data augmentation,
and temporal context adjustments. Extensive experiments showed that these strategies significantly
improved performance, achieving a maximum ROC-AUC of 0.728 on the public leaderboard and 0.690
on the private leaderboard.</p>
        <p>Kumar et al. [19]: This team employed methods like using pseudo-labels for large unlabeled datasets,
data augmentations like MixUp and CutMix, and noise reduction techniques to overcome the shift
in acoustic domains. The best-performing models utilized ViT (Vision Transformer) and DeiT
(Data-efficient image Transformers) architectures with positional encoding to improve spatial context. The
training process involved cosine annealing and weighted sampling, and the use of the transformer
model presented some challenges, such as increased computational requirements and the need for
extensive pre-training. Despite these constraints, the team achieved a maximum private leaderboard
score of 0.629 (public leaderboard score 0.638).</p>
        <p>Miyaguchi et al. [20]: This team investigated the distributional shift caused by the addition of
unlabeled soundscapes, representative of the hidden test set, by using transfer learning for birdcall
classification with embeddings from pre-trained models like Google’s Bird Vocalization Classification
Model, BirdNET, and EnCodec [21]. They experimented with different training losses, including Binary
Cross-Entropy, Asymmetric Loss, and sigmoidF1, and proposed a pseudo multi-label classification
strategy to utilize the unlabeled data. Efficient framework conversions and targeted optimizations
addressed computational challenges posed by restricted inference runtime. The best-performing models
achieved a maximum private score of 0.586 (public 0.556).</p>
        <p>Porwal, Aaditya [22]: In this working note, the participant details an approach using an ensemble
of EfficientNet-B0 and EfficientNet-B1 models. EfficientNet-B0 was exclusively trained on this year’s
data with heavy augmentations, while EfficientNet-B1 was pre-trained on previous datasets. Mel
spectrograms were used for audio preprocessing, enhanced by augmentations like mixup and masking.
The ensemble method, combining predictions from both models, achieved a maximum private score of
0.653 and a public score of 0.663.</p>
      </sec>
    </sec>
    <sec id="sec-4">
      <title>4. Conclusions and Lessons Learned</title>
      <p>Many top-performing solutions leveraged pseudo-labeling techniques to effectively use the unlabeled
soundscape data, demonstrating the importance of domain adaptation in improving model accuracy.
Using diverse ensemble models, combining predictions from various architectures and configurations
proved critical for enhancing performance and robustness in acoustic bird identification. Addressing
the domain shift between high-quality training samples and noisy, real-world test soundscapes remains
a major challenge. Successful strategies included using domain adaptation techniques and robust
data augmentation methods like MixUp and CutMix. Balancing model complexity and inference time
within the two-hour CPU limit posed a significant challenge, leading to the development of more
efficient algorithms and optimization strategies. This greatly improves the real-world applicability
of the developed approaches and models. Submitted solutions included some innovative approaches:
The first-place submission utilized single-class cross-entropy for training, which provided strong
regularization and improved performance despite the rarity of multi-label samples. CAD was used to
obtain domain-invariant features, effectively minimizing the domain-shift issues and enhancing model
robustness. Additionally, integrating raw-waveform models with traditional spectrogram-based models
in ensembles provided diverse feature sets and improved overall performance.</p>
    </sec>
    <sec id="sec-5">
      <title>Acknowledgments</title>
      <p>Compiling the dataset for this competition involved many people and institutions. We thank everyone
who contributed to recording, annotating, and processing this year’s data. We also want to thank Kaggle
for hosting the competition, with special thanks to Maggie Demkin and Sohier Dane for their support
in reviewing the dataset and setting up the competition. We are grateful to Google for sponsoring
the prize money. Lastly, we thank all participants for sharing their code bases and write-ups with the
Kaggle community.</p>
      <sec id="sec-5-1">
        <title>All results, code notebooks, and forum posts are publicly available at: https://www.kaggle.com/c/birdclef-2024</title>
        <p>[18] M. Lasseck, Improving Bird Recognition using Pseudo-Labeled Recordings from the Target
Location, in: CLEF Working Notes 2024, CLEF 2024: Conference and Labs of the Evaluation Forum,
September 09–12, 2024, Grenoble, France, 2024.
[19] A. S. Kumar, T. Schlosser, D. Kowerko, TUC Media Computing at BirdCLEF 2024: Improving
Birdsong Classification Through Single Learning Models, in: CLEF Working Notes 2024, CLEF
2024: Conference and Labs of the Evaluation Forum, September 09–12, 2024, Grenoble, France,
2024.
[20] A. Miyaguchi, A. Cheung, M. Gustineli, A. Kim, Transfer Learning with Pseudo Multi-Label
Birdcall Classification for DS@GT BirdCLEF 2024, in: CLEF Working Notes 2024, CLEF 2024:
Conference and Labs of the Evaluation Forum, September 09–12, 2024, Grenoble, France, 2024.
[21] A. Défossez, J. Copet, G. Synnaeve, Y. Adi, High fidelity neural audio compression, arXiv preprint
arXiv:2210.13438 (2022).
[22] A. Porwal, Bird-Species Audio Identification, Ensembling of EfficientNet-B0 and Pre-trained
EfficientNet-B1 model, in: CLEF Working Notes 2024, CLEF 2024: Conference and Labs of the
Evaluation Forum, September 09–12, 2024, Grenoble, France, 2024.</p>
      </sec>
    </sec>
  </body>
  <back>
    <ref-list>
      <ref id="ref1">
        <mixed-citation>
          [1]
          <string-name>
            <given-names>L. S. M.</given-names>
            <surname>Sugai</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T. S. F.</given-names>
            <surname>Silva</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J. W. Ribeiro</given-names>
            <surname>Jr</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Llusia</surname>
          </string-name>
          ,
          <article-title>Terrestrial passive acoustic monitoring: review and perspectives</article-title>
          ,
          <source>BioScience</source>
          <volume>69</volume>
          (
          <year>2019</year>
          )
          <fpage>15</fpage>
          -
          <lpage>25</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref2">
        <mixed-citation>
          [2]
          <string-name>
            <given-names>L. S. M.</given-names>
            <surname>Sugai</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Desjonqueres</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T. S. F.</given-names>
            <surname>Silva</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Llusia</surname>
          </string-name>
          ,
          <article-title>A roadmap for survey designs in terrestrial acoustic monitoring</article-title>
          ,
          <source>Remote Sensing in Ecology and Conservation</source>
          <volume>6</volume>
          (
          <year>2020</year>
          )
          <fpage>220</fpage>
          -
          <lpage>235</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref3">
        <mixed-citation>
          [3]
          <string-name>
            <given-names>D.</given-names>
            <surname>Tuia</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Kellenberger</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Beery</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B. R.</given-names>
            <surname>Costelloe</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Zuffi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Risse</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Mathis</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M. W.</given-names>
            <surname>Mathis</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>van Langevelde</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Burghardt</surname>
          </string-name>
          , et al.,
          <article-title>Perspectives in machine learning for wildlife conservation</article-title>
          ,
          <source>Nature Communications</source>
          <volume>13</volume>
          (
          <year>2022</year>
          )
          <fpage>1</fpage>
          -
          <lpage>15</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref4">
        <mixed-citation>
          [4]
          <string-name>
            <given-names>N.</given-names>
            <surname>Myers</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R. A.</given-names>
            <surname>Mittermeier</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C. G.</given-names>
            <surname>Mittermeier</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G. A.</given-names>
            <surname>Da Fonseca</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Kent</surname>
          </string-name>
          ,
          <article-title>Biodiversity hotspots for conservation priorities</article-title>
          ,
          <source>Nature</source>
          <volume>403</volume>
          (
          <year>2000</year>
          )
          <fpage>853</fpage>
          -
          <lpage>858</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref5">
        <mixed-citation>
          [5]
          <string-name>
            <given-names>A.</given-names>
            <surname>Joly</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Picek</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Kahl</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Goëau</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Espitalier</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Botella</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Deneu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Marcos</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Estopinan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Leblanc</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Larcher</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Šulc</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Hrúz</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Servajean</surname>
          </string-name>
          , et al.,
          <article-title>Overview of LifeCLEF 2024: Challenges on species distribution prediction and identification</article-title>
          , in:
          <source>International Conference of the Cross-Language Evaluation Forum for European Languages</source>
          , Springer,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref6">
        <mixed-citation>
          [6]
          <string-name>
            <given-names>A.</given-names>
            <surname>Joly</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Goëau</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Kahl</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Picek</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Lorieul</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Cole</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Deneu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Servajean</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Ruiz De Castañeda</surname>
          </string-name>
          ,
          <string-name>
            <given-names>I.</given-names>
            <surname>Bolon</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Glotin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Planqué</surname>
          </string-name>
          ,
          <string-name>
            <given-names>W.-P.</given-names>
            <surname>Vellinga</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Dorso</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Klinck</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Denton</surname>
          </string-name>
          ,
          <string-name>
            <given-names>I.</given-names>
            <surname>Eggel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Bonnet</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Müller</surname>
          </string-name>
          ,
          <article-title>Overview of LifeCLEF 2021: a System-oriented Evaluation of Automated Species Identification and Species Distribution Prediction</article-title>
          , in:
          <source>Proceedings of the Twelfth International Conference of the CLEF Association (CLEF 2021)</source>
          ,
          <year>2021</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref7">
        <mixed-citation>
          [7]
          <string-name>
            <given-names>S.</given-names>
            <surname>Kahl</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Clapp</surname>
          </string-name>
          ,
          <string-name>
            <given-names>W.</given-names>
            <surname>Hopping</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Goëau</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Glotin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Planqué</surname>
          </string-name>
          ,
          <string-name>
            <given-names>W.-P.</given-names>
            <surname>Vellinga</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Joly</surname>
          </string-name>
          ,
          <article-title>Overview of BirdCLEF 2020: Bird sound recognition in complex acoustic environments</article-title>
          , in:
          <source>CLEF task overview 2020, CLEF: Conference and Labs of the Evaluation Forum</source>
          , Sep.
          <year>2020</year>
          , Thessaloniki, Greece.,
          <year>2020</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref8">
        <mixed-citation>
          [8]
          <string-name>
            <given-names>S.</given-names>
            <surname>Kahl</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C. M.</given-names>
            <surname>Wood</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Eibl</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Klinck</surname>
          </string-name>
          ,
          <article-title>BirdNET: A deep learning solution for avian diversity monitoring</article-title>
          ,
          <source>Ecological Informatics</source>
          <volume>61</volume>
          (
          <year>2021</year>
          )
          <fpage>101236</fpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref9">
        <mixed-citation>
          [9]
          <string-name>
            <given-names>Y.</given-names>
            <surname>Shiu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Palmer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M. A.</given-names>
            <surname>Roch</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Fleishman</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Liu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.-M.</given-names>
            <surname>Nosal</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Helble</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Cholewiak</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Gillespie</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Klinck</surname>
          </string-name>
          ,
          <article-title>Deep neural networks for automated detection of marine mammal species</article-title>
          ,
          <source>Scientific Reports</source>
          <volume>10</volume>
          (
          <year>2020</year>
          )
          <fpage>1</fpage>
          -
          <lpage>12</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref10">
        <mixed-citation>
          [10]
          <string-name>
            <given-names>B.</given-names>
            <surname>van Merriënboer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Hamer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Dumoulin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Triantafillou</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Denton</surname>
          </string-name>
          ,
          <article-title>Birds, bats and beyond: Evaluating generalization in bioacoustics models</article-title>
          ,
          <source>Frontiers in Bird Science</source>
          <volume>3</volume>
          (
          <year>2024</year>
          )
          <fpage>1369756</fpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref11">
        <mixed-citation>
          [11]
          <string-name>
            <given-names>A. P.</given-names>
            <surname>Hill</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Prince</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J. L.</given-names>
            <surname>Snaddon</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C. P.</given-names>
            <surname>Doncaster</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Rogers</surname>
          </string-name>
          ,
          <article-title>Audiomoth: A low-cost acoustic device for monitoring biodiversity and the environment</article-title>
          ,
          <source>HardwareX</source>
          <volume>6</volume>
          (
          <year>2019</year>
          )
          <fpage>e00073</fpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref12">
        <mixed-citation>
          [12]
          <string-name>
            <given-names>V.</given-names>
            <surname>Ramesh</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Hariharan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Akshay</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Choksi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Khanwilkar</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>DeFries</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Robin</surname>
          </string-name>
          ,
          <article-title>Using passive acoustic monitoring to examine the impacts of ecological restoration on faunal biodiversity in the western ghats</article-title>
          ,
          <source>Biological Conservation</source>
          <volume>282</volume>
          (
          <year>2023</year>
          )
          <fpage>110071</fpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref13">
        <mixed-citation>
          [13]
          <string-name>
            <given-names>M.</given-names>
            <surname>Boudiaf</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Denton</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>van Merriënboer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Dumoulin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Triantafillou</surname>
          </string-name>
          ,
          <article-title>In search for a generalizable method for source free domain adaptation</article-title>
          , in:
          <string-name>
            <given-names>A.</given-names>
            <surname>Krause</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Brunskill</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Cho</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Engelhardt</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Sabato</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Scarlett</surname>
          </string-name>
          (Eds.),
          <source>Proceedings of the 40th International Conference on Machine Learning</source>
          , volume
          <volume>202</volume>
          <source>of Proceedings of Machine Learning Research, PMLR</source>
          ,
          <year>2023</year>
          , pp.
          <fpage>2914</fpage>
          -
          <lpage>2931</lpage>
          . URL:
          <ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v202/boudiaf23a.html">https://proceedings.mlr.press/v202/boudiaf23a.html</ext-link>
          .
        </mixed-citation>
      </ref>
      <ref id="ref14">
        <mixed-citation>
          [14]
          <string-name>
            <given-names>Y.</given-names>
            <surname>Ruan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Dubois</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C. J.</given-names>
            <surname>Maddison</surname>
          </string-name>
          ,
          <article-title>Optimal representations for covariate shift</article-title>
          ,
          <year>2022</year>
          . arXiv:
          <pub-id pub-id-type="arxiv">2201.00057</pub-id>
          .
        </mixed-citation>
      </ref>
      <ref id="ref15">
        <mixed-citation>
          [15]
          <string-name>
            <given-names>K. V.</given-names>
            <surname>Dmitriev</surname>
          </string-name>
          ,
          <article-title>Methods for training convolutional neural networks to identify bird species in complex soundscape recordings</article-title>
          , in:
          <source>CLEF Working Notes 2024, CLEF 2024: Conference and Labs of the Evaluation Forum, September 09-12, 2024, Grenoble, France</source>
          ,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref16">
        <mixed-citation>
          [16]
          <string-name>
            <given-names>L.</given-names>
            <surname>Hong</surname>
          </string-name>
          ,
          <article-title>Domain Adaption for Birdcall Recognition: Progressive Knowledge Distillation with Semi-Supervised and Self-Supervised Soundscape Labeling</article-title>
          , in:
          <source>CLEF Working Notes 2024, CLEF 2024: Conference and Labs of the Evaluation Forum, September 09-12, 2024, Grenoble, France</source>
          ,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref17">
        <mixed-citation>
          [17]
          <string-name>
            <given-names>E.</given-names>
            <surname>Witting</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Lim</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>de Heer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C. T.</given-names>
            <surname>Kopar</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Sándor</surname>
          </string-name>
          ,
          <article-title>Addressing the Challenges of Domain Shift in Bird Call Classification for BirdCLEF 2024</article-title>
          , in:
          <source>CLEF Working Notes 2024, CLEF 2024: Conference and Labs of the Evaluation Forum, September 09-12, 2024, Grenoble, France</source>
          ,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
    </ref-list>
  </back>
</article>