<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta />
    <article-meta>
      <title-group>
        <article-title>Overview of ImageCLEFmedical 2025 - Medical Concept Detection and Interpretable Caption Generation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author">
          <string-name>Hendrik Damm</string-name>
          <email>hendrik.damm@fh-dortmund.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Tabea M. G. Pakull</string-name>
          <email>tabea.pakull@uk-essen.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Helmut Becker</string-name>
          <email>helmut.becker@uk-essen.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Benjamin Bracke</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Bahadır Eryılmaz</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Louise Bloch</string-name>
          <email>louise.bloch@fh-dortmund.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
          <xref ref-type="aff" rid="aff3">3</xref>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Raphael Brüngel</string-name>
          <email>raphael.bruengel@fh-dortmund.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
          <xref ref-type="aff" rid="aff3">3</xref>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Cynthia S. Schmidt</string-name>
          <email>cynthia.schmidt@uk-essen.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Johannes Rückert</string-name>
          <email>johannes.rueckert@fh-dortmund.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Obioma Pelka</string-name>
          <email>obioma.pelka@uk-essen.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff1">1</xref>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Henning Schäfer</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
          <xref ref-type="aff" rid="aff3">3</xref>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Ahmad Idrissi-Yaghir</string-name>
          <email>ahmad.idrissi-yaghir@uk-essen.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Asma Ben Abacha</string-name>
          <email>abenabacha@microsoft.com</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Alba G. Seco de Herrera</string-name>
          <email>alba.garcia@lsi.uned.es</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff7">7</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Henning Müller</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff8">8</xref>
          <xref ref-type="aff" rid="aff9">9</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Christoph M. Friedrich</string-name>
          <email>christoph.friedrich@fh-dortmund.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <aff id="aff0">
          <label>0</label>
          <institution>Athens University of Economics and Business, Greece Iran University of Science and Technology</institution>
          ,
          <addr-line>Tehran</addr-line>
          ,
          <institution>Iran - University of Information Technology, Ho Chi Minh City, Vietnam Hunan City University, China Rajalakshmi Engineering College, Chennai, India Universidad Europea de Valencia, Spain University of Murcia, Spain Vellore Institute of Technology, Chennai, India Morgan State University</institution>
          ,
          <addr-line>Baltimore</addr-line>
          ,
          <country>USA Chung</country>
          <institution>-Ang University</institution>
          ,
          <addr-line>Seoul</addr-line>
          ,
          <country>Republic of Korea</country>
        </aff>
        <aff id="aff1">
          <label>1</label>
          <institution>Data Integration Center, Central IT Department, University Hospital Essen</institution>
          ,
          <addr-line>Essen</addr-line>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff2">
          <label>2</label>
          <institution>Department of Computer Science, University of Applied Sciences and Arts Dortmund</institution>
          ,
          <addr-line>Dortmund</addr-line>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff3">
          <label>3</label>
          <institution>Institute for Artificial Intelligence in Medicine (IKIM), University Hospital Essen</institution>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff4">
          <label>4</label>
          <institution>Institute for Medical Informatics, Biometry and Epidemiology (IMIBE), University Hospital Essen</institution>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff5">
          <label>5</label>
          <institution>Institute for Transfusion Medicine, University Hospital Essen</institution>
          ,
          <addr-line>Essen</addr-line>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff6">
          <label>6</label>
          <institution>Microsoft</institution>
          ,
          <addr-line>Redmond, Washington</addr-line>
          ,
          <country country="US">USA</country>
        </aff>
        <aff id="aff7">
          <label>7</label>
          <institution>School of Computer Science, National University of Distance Education (UNED)</institution>
          ,
          <country country="ES">Spain</country>
        </aff>
        <aff id="aff8">
          <label>8</label>
          <institution>University of Applied Sciences Western Switzerland (HES-SO)</institution>
          ,
          <country country="CH">Switzerland</country>
        </aff>
        <aff id="aff9">
          <label>9</label>
          <institution>University of Geneva</institution>
          ,
          <country country="CH">Switzerland</country>
        </aff>
      </contrib-group>
      <pub-date>
        <year>2025</year>
      </pub-date>
      <volume>36</volume>
      <fpage>248</fpage>
      <lpage>255</lpage>
      <abstract>
        <p>The ImageCLEFmedical 2025 Caption task follows challenges held from 2017-2024 and comprises three subtasks: concept detection, caption prediction, and a newly introduced explainability task. The goal is to extract Unified Medical Language System (UMLS) concepts, generate fluent captions from medical images, and provide human-interpretable justifications for the outputs. This year's edition used an enlarged version of the Radiology Objects in COntext version 2 (ROCOv2) dataset, which was expanded with new articles and the inclusion of the optical coherence tomography (OCT) imaging modality. For concept detection, the F1-score was used to evaluate predictions against UMLS terms. For caption prediction, evaluation was updated to a composite score averaging six metrics to assess both relevance and factuality. The new explainability submissions were manually judged by a radiologist. The 2025 task attracted 80 registered research groups, with 11 teams submitting a total of 149 graded runs across the three subtasks. Top-performing systems for concept detection were predominantly based on ensembles of Convolutional Neural Networks (CNNs). For caption prediction, a general shift towards fine-tuning Vision-Language Models (VLMs) was observed, with adapted architectures like BLIP leading to strong results across the new composite metrics. Finally, the inaugural explainability task saw initial submissions of post-hoc visualizations, establishing a baseline and clarifying the need for model-intrinsic explanations in future editions.</p>
      </abstract>
      <kwd-group>
        <kwd>ImageCLEF</kwd>
        <kwd>Computer Vision</kwd>
        <kwd>Multi-Label Classification</kwd>
        <kwd>Image Captioning</kwd>
        <kwd>Image Understanding</kwd>
        <kwd>Radiology</kwd>
        <kwd>Explainable AI</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec id="sec-1">
      <title>1. Introduction</title>
      <p>
        ImageCLEF (https://www.imageclef.org/, last accessed: 2025-06-01) [
        <xref ref-type="bibr" rid="ref1">1</xref>
        ] is the image retrieval and classification lab of the Conference and Labs of the
Evaluation Forum (CLEF) conference [
        <xref ref-type="bibr" rid="ref2">2</xref>
        ]. ImageCLEF 2025 [
        <xref ref-type="bibr" rid="ref3">3</xref>
        ] consists of the ImageCLEFmedical,
ImageCLEFrecommending, Image Retrieval for Arguments (Touché) and ImageCLEFToPicto labs, with
the ImageCLEFmedical lab being divided into the subtasks Caption (image captioning), VQA
(text-to-image generation), MEDIQA-MAGIC (Multimodal And Generative TelemedICine) and GANs (generation
of medical images).
      </p>
      <p>
        The Caption task was first proposed as part of the ImageCLEFmedical [
        <xref ref-type="bibr" rid="ref4">4</xref>
        ] in 2016. In 2017 and
2018 [
        <xref ref-type="bibr" rid="ref5 ref6">5, 6</xref>
        ] it comprised two subtasks: concept detection and caption prediction. From 2019 [
        <xref ref-type="bibr" rid="ref7">7</xref>
        ] to
2020 [
        <xref ref-type="bibr" rid="ref8">8</xref>
        ] the focus shifted to concept detection, extracting Unified Medical Language System ® (UMLS) [
        <xref ref-type="bibr" rid="ref9">9</xref>
        ]
Concept Unique Identifiers (CUIs) from radiology images. Since 2021 [
        <xref ref-type="bibr" rid="ref10">10</xref>
        ] both subtasks have run in
parallel again, with gradually higher-quality, manually annotated data and—in 2023—a switch from
BLEU [
        <xref ref-type="bibr" rid="ref11">11</xref>
        ] to BERTScore [
        <xref ref-type="bibr" rid="ref12">12</xref>
        ] as the primary caption-prediction metric [
        <xref ref-type="bibr" rid="ref13">13</xref>
        ]. The 2024 edition introduced
a small-scale explainability trial and an enlarged metric set.
      </p>
      <p>2025 marks the 9th edition of the ImageCLEFmedical Caption task. Building on the lessons of previous
years, the task now comprises three components:
1. Concept Detection – identification of UMLS concepts in radiology images;
2. Caption Prediction – generation of coherent captions for full images;
3. Explainability – newly promoted to an official subtask: participants must provide
human-interpretable explanations for a designated subset of images, which are manually judged by
a radiologist for interpretability, relevance and creativity.</p>
      <p>For caption prediction, the overall ranking is now based on the average across six metrics (see
Section 4), reflecting both relevance and factuality aspects of the generated captions.</p>
      <p>Manual creation of structured knowledge from medical images is slow and error-prone. By
benchmarking automatic systems that detect clinical concepts, compose fluent radiology captions and justify
their outputs, ImageCLEFmedical 2025 continues to stimulate research toward scalable, trustworthy
radiology-image understanding.</p>
      <p>
        As in 2024, the development data are drawn from an extended version of the Radiology Objects in
COntext Version 2 (ROCOv2) dataset [
        <xref ref-type="bibr" rid="ref14">14</xref>
        ]. For 2025, this release has been enlarged with additional,
newly released PubMed Central® Open-Access articles whose images and captions were again manually
annotated with modalities. A novelty of this year’s dataset is the inclusion of the imaging modality
optical coherence tomography (OCT), which has been retrospectively annotated for every existing
ROCOv2 image and prospectively annotated for all new articles. The final split now comprises 80 091
training, 17 277 validation, and 19 267 test radiology images, all with updated licensing curation and
UMLS (2022 AB) concept filtering.
      </p>
      <p>
        This paper presents an overview of the ImageCLEFmedical 2025 Caption task: the task design and
participation (Section 2), data creation (Section 3), evaluation methodology (Section 4), results (Section
5) and conclusions (Section 6). Further information on the other ImageCLEF 2025 tasks can be found in
Ionescu et al. [
        <xref ref-type="bibr" rid="ref3">3</xref>
        ].
      </p>
    </sec>
    <sec id="sec-2">
      <title>2. Task and Participation</title>
      <p>
        For the 9th edition, the ImageCLEFmedical Caption task builds on two familiar subtasks:
• T1 Concept Detection. Systems predict Unified Medical Language System ® (UMLS) Concept
Unique Identifiers (CUIs) [
        <xref ref-type="bibr" rid="ref9">9</xref>
        ] directly from radiology images, following the format introduced in
2017 [
        <xref ref-type="bibr" rid="ref5">5</xref>
        ].
• T2 Caption Prediction. Systems generate full-sentence captions for each image, a subtask that
returned in 2021 after a pause in 2019–2020.
and introduces a third, officially graded component:
• Exp Explainability. For a small radiologist-selected subset, each team provides one
human-interpretable explanation (for example a heat-map, bounding boxes or a textual rationale) that
relates the image to the generated caption. This explanation is intended to clarify the model’s
decision-making process and thereby support clinicians in building trust in the model.
Explanations are judged manually by a radiologist for interpretability, clinical relevance and creativity.
      </p>
      <p>The 2025 edition also adds six evaluation metrics for caption prediction (see Section 4) and
retrospectively annotates the complete ROCOv2 corpus with the new optical coherence tomography
(OCT) modality. To compensate for the greater computational effort and occasional Docker-induced
submission problems, the limit for graded runs per team was raised to 30 for T1 and T2; previously, it
had been set at 10 runs. The Explainability task (Exp) allowed only one submission, due to the manual
evaluation effort.</p>
      <sec id="sec-2-1">
        <title>2.1. Participation Statistics</title>
        <p>Eighty research groups signed the End-User Agreement and downloaded the development data. Eleven
of them submitted runs and ten provided accompanying working-note papers. The submissions were
distributed across the tasks as follows:
• Concept Detection (T1): 9 teams, 51 graded runs.
• Caption Prediction (T2): 8 teams, 98 graded runs.
• Explainability (Exp): 2 teams, 2 graded runs.</p>
        <p>• Total: 149 graded runs.</p>
        <p>Six groups took part in both T1 and T2. Three teams (DeepLens, mapan and LekshmiscopeVIT)
focused on concept detection only, and two (CS_Morgan and AI Stat Lab) entered just the
caption-prediction track. Five teams (AUEB NLP Group, UIT-Oggy, CS_Morgan, sakthiii and LekshmiscopeVIT)
had already participated in 2024 and are marked with an asterisk in Table 1.</p>
        <p>The 2025 task therefore attracted a participant pool similar in size to earlier editions but generated
more graded submissions, while also promoting explainability to a fully assessed subtask.</p>
      </sec>
    </sec>
    <sec id="sec-3">
      <title>3. Data Creation</title>
      <sec id="sec-3-1">
        <title>3.1. Source and Split</title>
        <p>
          All data originate from articles in the PubMed Central® (PMC) Open-Access subset (https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist/, last accessed: 2025-06-01) [
          <xref ref-type="bibr" rid="ref25">25</xref>
          ]. The
development data correspond to an extended release of ROCOv2 [
          <xref ref-type="bibr" rid="ref14">14</xref>
          ], enlarged with all papers published
between October 2022 and December 2024. Captions were only stripped of URLs and non-English
captions were dropped.
        </p>
        <p>The final dataset is split into 80 091 training, 17 277 validation and 19 267 test images (116 635 in
total).</p>
        <p>Example image caption from the dataset: "Computed tomography images after treatment. Thoracic
SMARCA4‐deficient undifferentiated tumor showing osteolytic changes in the ribs (asterisk) is noted.
However, pleural thickening (yellow arrow) disappears and pleural effusion (yellow arrowhead) decreases
in the mediastinal window setting."</p>
      </sec>
      <sec id="sec-3-2">
        <title>3.2. Concept Extraction</title>
        <p>
          Concepts were extracted with MedCAT [
          <xref ref-type="bibr" rid="ref26">26</xref>
          ] trained on MIMIC-III [
          <xref ref-type="bibr" rid="ref27">27</xref>
          ] and mapped to UMLS 2022AB
CUIs. Only concepts occurring at least ten times and belonging to semantically “visible” TUI groups
were kept; ambiguous or spurious concepts were merged or removed through manual curation.
        </p>
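        <p>As a rough illustration of the frequency filter described above (not the actual curation pipeline), the
following sketch keeps only CUIs that occur at least ten times across the corpus; the input mapping and
the toy CUIs are assumptions for the example.</p>
        <preformat>
from collections import Counter

def filter_rare_concepts(image_to_cuis, min_count=10):
    """Keep only CUIs that occur at least `min_count` times across the corpus.

    `image_to_cuis` maps an image identifier to the list of UMLS CUIs
    extracted from its caption (e.g., by MedCAT)."""
    counts = Counter(cui for cuis in image_to_cuis.values() for cui in cuis)
    frequent = {cui for cui, n in counts.items() if n >= min_count}
    return {
        img: [cui for cui in cuis if cui in frequent]
        for img, cuis in image_to_cuis.items()
    }

# Toy example (identifiers and CUIs are illustrative only):
annotations = {"img1": ["C0040405", "C0024485"], "img2": ["C0040405"]}
print(filter_rare_concepts(annotations, min_count=2))
        </preformat>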
      </sec>
      <sec id="sec-3-3">
        <title>3.3. Modality and Region Concepts</title>
        <p>Each image is manually labelled with an imaging-modality concept. In addition to the five modalities
used in previous editions (X-ray, CT, MRI, ultrasound, PET/PET-CT) the 2025 corpus introduces optical
coherence tomography (OCT, CUI C0920367). OCT was annotated retrospectively for the entire
archive and prospectively for new articles.</p>
        <p>Table 2 lists the modality distribution, while Table 3 details the image retrieval in medical applications
(IRMA) region counts.</p>
      </sec>
      <sec id="sec-3-4">
        <title>3.4. Concept Statistics</title>
        <p>Concept statistics for the released splits are summarised in Table 4 and listed below.</p>
      </sec>
      <sec id="sec-3-5">
        <title>3.5. Released Sets</title>
        <p>• Training set: 80 091 images, 252 772 concept occurrences, 1 949 unique concepts.
• Validation set: 17 277 images, 48 761 concept occurrences, 716 unique concepts.
• Test set: 19 267 images, 24 242 concept occurrences, 702 unique concepts.
• Explainability set: 16 images (two from each modality, including two OCT cases) were selected by
a radiologist based on the clinical relevance of both the images and their corresponding captions
for manual assessment. In addition, examples of what such explanations might look like are
provided in Figure 2.</p>
      </sec>
    </sec>
    <sec id="sec-4">
      <title>4. Evaluation Methodology</title>
      <p>This year, the evaluation procedure was revised to reflect improved methodology and the incorporation
of new tools and metrics. As in previous editions, the subtasks were evaluated independently.</p>
      <p>In 2025, the AI4MediaBench platform (https://ai4media-bench.aimultimedialab.ro/, last accessed: 2025-06-02) by AIMultimediaLab (https://www.aimultimedialab.ro/, last accessed: 2025-06-02) was used as the challenge platform.</p>
      <p>For the concept detection subtask, the balanced precision and recall trade-off was measured in terms
of F1-scores. As in last year's edition, a secondary F1-score is computed on a manually curated subset of
concepts. On the one hand, this subset covers the different image modalities (X-ray, Angiography,
Ultrasound, CT, MRI, PET, OCT, and combined modalities such as PET/CT). On the other hand, where
applicable, it additionally includes the IRMA anatomical code for the body region examined in X-ray
images (cranium, chest, upper extremity, spine, abdomen, pelvis, and lower extremity).</p>
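      <p>A minimal sketch of how such an F1-score can be computed per image between predicted and
ground-truth CUI sets and then averaged over all images is shown below; it approximates, but is not,
the official scorer.</p>
      <preformat>
def f1_per_image(pred, gold):
    """F1 between one image's predicted and ground-truth CUI sets."""
    pred, gold = set(pred), set(gold)
    if not pred and not gold:
        return 1.0
    tp = len(pred.intersection(gold))
    if tp == 0:
        return 0.0
    precision = tp / len(pred)
    recall = tp / len(gold)
    return 2 * precision * recall / (precision + recall)

def mean_f1(predictions, references):
    """Average the per-image F1 over all test images (dict keys are image ids)."""
    return sum(
        f1_per_image(predictions.get(img, []), cuis)
        for img, cuis in references.items()
    ) / len(references)
      </preformat>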
      <p>For caption prediction, system outputs were assessed using a composite score that averages six
complementary metrics to jointly capture aspects of relevance and factuality. For each caption, the six
individual metric scores are averaged, and these per-caption scores are then averaged over all captions
to obtain the final score.</p>
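      <p>A minimal sketch of this aggregation is given below; the metric names used as dictionary keys are
illustrative, and any normalisation details of the official scorer are not reproduced.</p>
      <preformat>
import numpy as np

# Illustrative keys for the six per-caption metric values described in this section.
METRICS = ["bertscore_recall", "rouge1_f", "bleurt", "similarity",
           "umls_concept_f1", "alignscore"]

def composite_score(per_caption_scores):
    """Average the six metric values for each caption, then average over captions.

    `per_caption_scores` is a list of dicts, one per test caption."""
    per_caption = [np.mean([s[m] for m in METRICS]) for s in per_caption_scores]
    return float(np.mean(per_caption))
      </preformat>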
      <p>
        Relevance was evaluated using four different methods. The first of these is BERTScore [
        <xref ref-type="bibr" rid="ref12">12</xref>
        ], a metric
that computes a similarity score for each token in the generated text with each token in the
reference text. It uses the pre-trained contextual embeddings from Bidirectional Encoder Representations
from Transformers (BERT) [
        <xref ref-type="bibr" rid="ref28">28</xref>
        ]-based models and matches words by cosine similarity. In this work, the
pre-trained model microsoft/deberta-xlarge-mnli (https://huggingface.co/microsoft/deberta-xlarge-mnli,
last accessed: 2025-06-05) was used because it is the model that correlates best with human scoring
according to the authors (https://github.com/Tiiiger/bert_score, last accessed: 2025-06-05). Following
best practices for caption evaluation reported by [
        <xref ref-type="bibr" rid="ref12">12</xref>
        ], we computed recall-based BERTScore with inverse document frequency (idf) weighting, using
idf scores derived from the test set to emphasize informative terms. The second metric, the ROUGE
(Recall-Oriented Understudy for Gisting Evaluation [
        <xref ref-type="bibr" rid="ref29">29</xref>
        ]) score, counts the number of overlapping units such
as n-grams, word sequences, and word pairs between the generated text and the reference. Specifically,
the ROUGE-1 (F-measure) score was calculated, which measures the number of matching unigrams
between the model-generated text and a reference. The third relevance metric, BLEURT (BiLingual
Evaluation Understudy with Representations from Transformers) [
        <xref ref-type="bibr" rid="ref30">30</xref>
        ], is designed to assess the quality
of natural language generation in English by leveraging a pre-trained model that has been fine-tuned to
emulate human judgments about the quality of the generated text. The strength of BLEURT lies in its
end-to-end training, which enables it to model human judgments effectively and makes it robust to
domain and quality variations. For this evaluation, the BLEURT-20 model was used.
      </p>
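      <p>The text-based relevance metrics can be reproduced approximately with the bert-score and rouge-score
packages, as in the hedged sketch below (BLEURT-20 additionally requires the separate bleurt package and
its checkpoint and is therefore omitted); the example captions are invented.</p>
      <preformat>
from bert_score import score as bertscore
from rouge_score import rouge_scorer

candidates = ["ct scan of the chest showing a nodule in the right upper lobe"]
references = ["computed tomography of the chest with a right upper lobe nodule"]

# Recall-oriented BERTScore with idf weights computed from the reference (test-set)
# captions, using the model named in the text.
_, recall, _ = bertscore(
    candidates, references,
    model_type="microsoft/deberta-xlarge-mnli",
    idf=True,
)
print("BERTScore recall:", recall.mean().item())

# ROUGE-1 F-measure (unigram overlap between candidate and reference).
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False)
rouge1_f = scorer.score(references[0], candidates[0])["rouge1"].fmeasure
print("ROUGE-1 F:", rouge1_f)
      </preformat>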
      <p>All of the above-mentioned metrics were computed using preprocessed captions that were lowercased
and had punctuation stripped. Numeric values were replaced with the token "number." The captions
were treated as single sentences, regardless of actual sentence boundaries. This step ensures uniformity
and focuses the evaluation on linguistic content.</p>
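      <p>A minimal sketch of this normalisation step is shown below; the exact rules of the official
evaluation scripts may differ in detail.</p>
      <preformat>
import re
import string

def preprocess_caption(text):
    """Lowercase, replace numeric values with "number", strip punctuation,
    and collapse the caption into a single whitespace-normalised string."""
    text = text.lower()
    text = re.sub(r"\d+(\.\d+)?", "number", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    return " ".join(text.split())

print(preprocess_caption("CT scan, 3.5 cm lesion in segment 7."))
# -> "ct scan number cm lesion in segment number"
      </preformat>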
      <p>In addition to the text-based metrics, a reference-free metric was implemented. The methodology
is based on CLIPScore [31], an innovative metric that diverges from the traditional reference-based
evaluations of image captions. Instead, it aligns with the human approach of evaluating caption quality
without references by evaluating the alignment between text and image content. The original metric
employs Contrastive Language-Image Pretraining (CLIP) [32], a cross-modal model that has been
pre-trained on a massive dataset of image-caption pairs sourced from the web. For this year’s evaluation
the MedImageInsight [33] model was used instead. It is trained using medical images with associated
text and labels from a variety of domains, including X-ray, CT, MRI, OCT, and ultrasound. The model is
used to compute similarity scores between images and text.</p>
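      <p>The reference-free scoring idea can be illustrated with any CLIP-style dual encoder; the sketch below
uses the original openai/clip-vit-base-patch32 model from Hugging Face purely as a stand-in, whereas the
official evaluation relied on MedImageInsight embeddings.</p>
      <preformat>
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def image_text_similarity(image_path, caption):
    """Cosine similarity between the image and caption embeddings."""
    image = Image.open(image_path).convert("RGB")
    inputs = processor(text=[caption], images=image, return_tensors="pt",
                       padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    img_emb = outputs.image_embeds / outputs.image_embeds.norm(dim=-1, keepdim=True)
    txt_emb = outputs.text_embeds / outputs.text_embeds.norm(dim=-1, keepdim=True)
    return float((img_emb @ txt_emb.T).item())
      </preformat>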
      <p>To assess the factuality of the generated captions, two complementary metrics were employed. The
UMLS Concept F1-score evaluates the overlap of medical entities between the generated and reference
captions. Specifically, medical concepts were extracted using MedCAT [34], with a focus on semantic
types relevant to clinical accuracy as also defined for the MEDCON [35] metric; MEDCON, however,
relies on QuickUMLS [36] for concept extraction from both texts. This is followed by calculation
of the F1-score to quantify concept-level agreement. The other factuality metric, AlignScore [37],
employs a deep learning approach based on RoBERTa [38] to measure factual consistency. It involves
the decomposition of extensive texts into more manageable segments and aligning the claims in the
generated caption with the supporting evidence in the reference caption, thereby producing an average
alignment score across all claims.</p>
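      <p>The concept-extraction side of the UMLS Concept F1-score could look roughly like the sketch below,
which uses the MedCAT API with a placeholder model-pack path and an illustrative semantic-type filter;
the actual filter used for the evaluation may differ. The F1-score between the concepts of the generated
and reference captions is then computed with the same set-based formula as for concept detection above.</p>
      <preformat>
from medcat.cat import CAT

cat = CAT.load_model_pack("path/to/umls_model_pack.zip")  # placeholder path
CLINICAL_TYPE_IDS = {"T047", "T191", "T023", "T060"}      # illustrative TUIs

def extract_cuis(caption):
    """Return the CUIs of entities whose semantic types count as clinical."""
    entities = cat.get_entities(caption)["entities"].values()
    return {
        ent["cui"] for ent in entities
        if CLINICAL_TYPE_IDS.intersection(ent.get("type_ids", []))
    }
      </preformat>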
      <p>For the explainability extension, a radiologist was asked to rate both the caption and the visualisation
of each image in the explainability subset on a 1-5 Likert scale, with 5 being the best score.</p>
      <p>The captions were ranked in terms of readability, clinical appropriateness, level of detail, and focus.
The readability scale rates whether the predicted captions are readable and coherently formulated.
The clinical appropriateness evaluates whether the predicted captions match ground-truth captions
or are clinically plausible. The level of detail is used to assess whether the captions merely describe
visual findings or also interpret underlying clinical concepts. The focus validates the appropriateness of
the scope of the caption and thus penalizes short captions that lack essential observations as well as
excessively long captions that are not focused on the essentials.</p>
      <p>The visualisation was assessed based on visual-text coherence, completeness, and focus. The
visual-text coherence measures whether the visualisation is comprehensible in relation to the predicted caption. The
completeness scale assesses whether the visualisations address all relevant concepts. The focus validates
the appropriateness of the visualisation.</p>
    </sec>
    <sec id="sec-5">
      <title>5. Results</title>
      <p>
        For the concept detection and caption prediction subtasks, Tables 5 and 6 show the best results from
each of the participating teams. The results are discussed in this section. The full lists of results are
shown in Appendix A in Tables 12, 13 and 15. Finally, Table 9 presents the results for the explainability
subtask.
      </p>
      <sec id="sec-5-1">
        <title>5.1. Results for the Concept Detection Subtask</title>
        <p>In 2025, 9 teams participated in the concept detection subtask, submitting 51 graded runs. Table 5
presents the best result each team achieved across its submissions.</p>
        <p>
          AUEB NLP Group [
          <xref ref-type="bibr" rid="ref15">15</xref>
          ] The AUEB NLP Group based their approach on their past work, which
won the competition in several previous years but reached second place last year. The approach
combined CNNs (EfficientNet-B0 [39], DenseNet-121 [40], and ConvNeXt-Tiny [41]) with
per-label threshold optimization and ensembling strategies, including dual-threshold aggregation
and partial-intersection aggregation. The team took first place with a primary F1-score of
0.5888 and a secondary F1-score of 0.9484.
        </p>
      <p>
        DeepLens [
        <xref ref-type="bibr" rid="ref16">16</xref>
        ] The DeepLens team tackled the concept detection task with an ensemble model
pipeline which combined EfficientNet-B0 [39] and DenseNet-121 [40] under a simple union
ensemble. Both networks were optimized with the ADAM optimizer using the Binary Cross
Entropy with Logits loss function. The output layers of the models were replaced either with
a three-layer feed-forward head or a single linear classifier to fine-tune the models for
multi-label prediction. The ensemble with the best micro-F1 validation score was frozen for test
inference. This method delivered the team’s best submission, securing a primary F1-score of 0.5766
and a secondary F1-score of 0.9299, which placed second overall in the competition. Furthermore,
the DeepLens team experimented with a K-Nearest Concept-Language-Image Pre-training approach to
improve image-concept alignment in their ensemble strategy. Although it did not yield the best
quantitative results, it might hold interesting directions for future research.
      </p>
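      <p>The union-ensemble idea can be sketched as follows (this is an illustration of the principle, not the
team’s code): a concept is predicted whenever either model’s sigmoid output exceeds the threshold.</p>
      <preformat>
import numpy as np

def union_ensemble(probs_a, probs_b, threshold=0.5):
    """Union of two multi-label classifiers' thresholded sigmoid outputs.

    probs_a, probs_b: arrays of shape (num_images, num_concepts)."""
    return np.logical_or(probs_a >= threshold, probs_b >= threshold)
      </preformat>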
      <p>
        UIT-Oggy [
        <xref ref-type="bibr" rid="ref17">17</xref>
        ] For the concept detection task, the team designed MedCSRA, a novel architecture
featuring a dual-branch design that combines global semantic understanding through global
average pooling with localized class-specific residual attention (CSRA) mechanisms. Four CNN
backbones were evaluated: ResNet-101, DenseNet-121, EfficientNet-B4 and EfficientNet-B5. All
were pre-trained on ImageNet and fine-tuned for medical multi-label classification using Binary
Cross Entropy Loss. The final prediction uses a weighted combination of the outputs from the
global and CSRA branches. ResNet-101 achieved the highest F1-score of 0.5613, demonstrating
that specialized attention mechanisms can effectively identify multiple medical concepts in
biomedical images.
      </p>
      <p>
        DS4DH [
        <xref ref-type="bibr" rid="ref18">18</xref>
        ] reformulated concept detection as an image-to-sequence task to leverage transformer-based
models capable of capturing the inherent order of UMLS codes (e.g., modality before anatomy
or pathology). They proposed a compact architecture combining a convolutional neural network
to extract low-dimensional image embeddings (as small as 16 dimensions) with a lightweight
transformer decoder (1 head, 2 layers) that autoregressively generates UMLS code sequences via
cross-attention. Beam search (width = 3) was used during decoding and improved performance.
This approach achieved an F1-score of 0.5225 and a secondary F1-score of 0.8672, ranking the
team fifth and sixth, respectively. To address class imbalance, the team experimented with
focal loss, label smoothing, and pre-trained embeddings (MedCPT [42], CUI2Vec [43]), but none
outperformed their baseline model.
      </p>
      <p>
        They observed that their model tended to produce short sequences (average length 1.3 CUIs) with
low diversity (15 unique predicted CUIs), which they attributed to dataset bias toward short and
imbalanced annotations. Applying loss masking strategies during training increased the average
sequence length to 3.0 CUIs and raised diversity to 103 unique CUIs. However, this revised model
underperformed in terms of F1-score compared to their baseline submission. The team suggested
this discrepancy may result from the challenge’s F1-score evaluation design, which potentially
favors shorter CUI sequences and penalizes longer, yet possibly correct predictions not aligned
with the ground-truth test data.
      </p>
      <p>
        sakthiii [
        <xref ref-type="bibr" rid="ref19">19</xref>
        ] For the concept detection task, team sakthiii employed a MedCLIP-based transformer
model, which was pre-trained on medical image-caption pairs. In the first stage of their dual-stage
training pipeline, they fine-tuned this MedCLIP model specifically for concept detection. This
process involved training for 11 epochs with a batch size of 32, using the Adam optimizer and
a learning rate of 1e-5. The dataset for this stage consisted of radiology images paired with
UMLS concepts, allowing the model to learn the mappings between visual features and structured
medical terms. Their best model for concept detection achieved an F1-score of 0.4003 and a
secondary F1-score of 0.9082, placing them eighth in this subtask.
      </p>
      <p>
        JJ-VMed [
        <xref ref-type="bibr" rid="ref20">20</xref>
        ] The JJ-VMed team employed a fine-tuned LLaVA-LLaMA 3 8B model, processing inputs
through a CLIP ViT-Large encoder. Training used prompt-based instruction tuning, and two
output formats were explored: one generating concepts independent from the caption, while the
second embedded them within full-text captions. They achieved a primary F1-score of 0.3982 and
a secondary F1-score of 0.8329, ranking them seventh in this subtask.
      </p>
      <p>
        UMUTeam [
        <xref ref-type="bibr" rid="ref21">21</xref>
        ] Based on the captions generated by a fine-tuned BLIP model, the UMUTeam employed
named entity recognition (SciSpacy), concept retrieval (SapBERT), followed by a BERT-based
reranking classifier, to extract the medical concepts for the concept detection subtask. They
achieved an F1-score of 0.2398 with a secondary F1-score of 0.5377, putting them in eighth place,
showing that this caption-based approach is inferior to multi-label classification systems.
      </p>
      <p>
        LekshmiscopeVIT [
        <xref ref-type="bibr" rid="ref22">22</xref>
        ] Team LekshmiscopeVIT focused on a broader evaluation of different deep
learning architectures to approach the concept detection subtask. The team employed the standard
architectures InceptionV3, DenseNet, and ResNet as well as a custom approach. Randomly
initialized and ImageNet [44] pre-trained models of each of the standard architectures were
fine-tuned on the ROCOv2 dataset for 10 epochs and then compared. Part of each training
pipeline was a uniform pre-processing step during which a multi-label binarizer was applied to
create a binary label matrix for training. The team further experimented with reduction of label
space complexity by limiting predictions to the most frequent concepts. The pre-trained ResNet
approach achieved the team’s best results of 0.1494 in the primary, and 0.2298 in the secondary
F1-score.
      </p>
      <p>The Concept Detection task this year revealed several methodological trends among the participating
teams. The top-performing approaches relied on convolutional neural network (CNN) ensembles,
combining multiple pre-trained architectures, such as EfficientNet, DenseNet, and ResNet. These
ensembles used fine-tuned classification heads and per-label threshold optimization to improve
multi-label prediction accuracy. Both simple and complex ensembling techniques proved effective, suggesting
that leveraging the complementary strengths of different models remains a strong strategy.</p>
      <p>Although CNNs dominated the leaderboard, several teams explored transformer-based and generative
approaches. These included image-to-sequence formulations and vision-language models, such as
MedCLIP and LLaVA. Though these methods were less competitive in terms of F1-scores, they indicate
a growing interest in multimodal models.</p>
      <p>Lower-ranking submissions often relied on caption-based pipelines and traditional CNNs without
extensive optimization or innovative architectures. These underperformed compared to more tailored
solutions.</p>
      <p>A comparison of the 2024 and 2025 ImageCLEFmedical Concept Detection subtasks reveals a decline
in primary F1-scores across the leaderboard, suggesting that this year’s task may have been more
challenging or less suited to the models deployed.</p>
      <p>Despite this overall decline in primary performance, secondary F1-scores based on manual annotations
remained high and in some cases even improved. For example, the AUEB NLP Group, which participated
in both years, saw a drop in primary F1-score, but an increase in secondary F1-score from 0.9393 to
0.9484, reclaiming the top spot.</p>
      <p>By training and evaluating our own baseline model on this year’s data, we could determine
that about 0.1 of the difference in primary F1-score is purely due to the new test dataset, which contains
a much smaller number of unique concepts (see Table 4).</p>
      <p>The observed decline in primary F1-scores can likely be attributed to several interrelated factors
stemming from changes in the dataset. First, the slight increase in average concepts per image introduced
greater multi-label complexity, making it more difficult to make fully correct predictions under the strict
F1-score metric. Second, the broader inclusion of imaging modalities, particularly the addition of optical
coherence tomography (OCT) and expanded angiography cases, may have introduced domain shifts
that negatively affected models that were not trained or tuned on such data. Lastly, although concept
filtering improved label quality, it may also have limited the label space, penalizing over-predictive or
less conservative systems.</p>
      </sec>
      <sec id="sec-5-2">
        <title>5.2. Results for the Caption Prediction Subtask</title>
        <p>In this edition, the caption prediction subtask attracted 8 teams, which submitted 98 graded runs.
Tables 6, 7 and 8 present the results of the submissions.</p>
      <p>
        UMUTeam [
        <xref ref-type="bibr" rid="ref21">21</xref>
        ] The UMUTeam employed the BLIP [45] architecture, which consists of a ViT encoder
and a language model decoder, to generate captions for medical images. They fine-tuned a model
which performs well in general image captioning benchmarks, selecting the best model based on
the relevance metric. With a score of 0.9271 for Similarity, 0.5977 for BERTScore Recall, 0.2594
for ROUGE-1, 0.3230 for BLEURT and an overall score of 0.3432, they won the caption prediction
subtask, scoring highest in all but the BERTScore Recall and AlignScore metrics.
      </p>
      <p>
        DS4DH [
        <xref ref-type="bibr" rid="ref18">18</xref>
        ] developed multiple strategies for automatic medical image captioning. First, they
fine-tuned a Vision-Language Model (InstructBLIP-Flan-T5-XL [46]) using selective parameter freezing,
focusing training on cross-modal alignment while keeping most of the vision and language
encoders fixed. Second, they implemented a Retrieval-Augmented Generation [47] (RAG) approach
that retrieves visually similar training images and incorporates their captions into the prompt
to guide caption generation. Third, they introduced a Cluster-based RAG strategy that groups
training data by the semantic similarity of CUI codes using MedCPT [42] embeddings, enabling
hierarchical retrieval within medically relevant clusters. Finally, they trained an alignment model
(BioBart-v2-large [48]) using pairs of InstructBLIP-generated and ground-truth captions to refine
caption quality.
      </p>
      <p>
        Among all approaches, the fine-tuned InstructBLIP model achieved the highest overall score
(0.3708) and ranked first in the recall-based BERTScore metric (0.6067) among all challenge
participants. In contrast, both the alignment model and standard RAG approach underperformed,
likely due to the introduction of noisy or irrelevant information, which reflects the visual
similarity but semantic variability of radiology images. The Cluster-based RAG showed moderate
improvements over standard RAG (e.g., overall score improved from 0.3478 to 0.3620). However,
due to possible noise in predicted CUIs (F1-score = 0.5225) from the concept detection subtask, it
still fell short of InstructBLIP. On the validation dataset, Cluster RAG outperformed InstructBLIP
on several metrics when ground-truth CUIs were used. This highlights the critical importance of
accurate concept detection for precise RAG retrieval cues, because even minor inaccuracies in
CUI prediction can introduce semantic noise and significantly degrade caption quality.
      </p>
      <p>
        AI Stat Lab [
        <xref ref-type="bibr" rid="ref24">24</xref>
        ] The team developed a modular framework for medical image captioning that begins
with a two-stage preprocessing pipeline. This includes 2× super-resolution and inpainting to
eliminate bright border artifacts. A dual-encoder setup (SigLIP2 [49] + BioMedCLIP [50]) feeds
into a Q-Former [51], which generates concept-aware tokens used for both captioning and medical
concept classification. A LoRA-tuned [52] Bio-Medical LLaMA-3-8B [53] serves as the decoder.
Six model variants produce captions that are either summarized using GPT-4 [54] or reranked
using custom-designed metrics: BioMedCLIP image-text alignment, BLEURT self-consensus, and
BioBERT [55] centroid proximity. Their best submission used BioMedCLIP alignment, achieving
an overall score of 0.3229 and ranking third overall.
      </p>
      <p>
        UIT-Oggy [
        <xref ref-type="bibr" rid="ref17">17</xref>
        ] For this task, the UIT-Oggy team fine-tuned the BLIP model by using Vision
Transformer (ViT) to encode images and BERT-based text decoding to generate medical captions.
Images were preprocessed to a uniform resolution of 224×224 and captions were tokenised to a
maximum length of 200 tokens, ensuring compatibility with the vision-language model’s input
requirements. The BLIP model achieved an overall score of 0.3211 for captioning, demonstrating
the effectiveness of vision-language pre-training in adapting to the terminology and context of
the medical domain.
      </p>
      <p>
        AUEB NLP Group [
        <xref ref-type="bibr" rid="ref15">15</xref>
        ] The AUEB NLP Group’s approach to caption prediction involved seven
primary systems: a fine-tuned InstructBLIP [46] model was extended by a synthesizer and
multi-synthesizer approach, an LM-Fuser, and a Distance from Median Maximum Concept
Similarity (DMMCS) mechanism. In addition, a test-time reranker based on MedCLIP [56] and a
reinforcement learning-based Mixer were implemented. The team’s best result was reached by
the fine-tuned InstructBLIP model, which achieved an overall rating of 0.3068 and fifth rank in
the challenge.
      </p>
      <p>
        JJ-VMed [
        <xref ref-type="bibr" rid="ref20">20</xref>
        ] In the caption prediction task, JJ-VMed reused their LLaVA-LLaMA 3 model for initial
generation, followed by post-processing with LLaMA 3.1. With a score of 0.8251 for Similarity,
0.5953 for BERTScore Recall, 0.2389 for ROUGE-1, 0.3094 for BLEURT and an overall score of
0.3043, they ranked sixth in the caption prediction subtask.
      </p>
      <p>
        sakthiii [
        <xref ref-type="bibr" rid="ref19">19</xref>
        ] Following the concept detection training, the team transitioned to the caption prediction
task by reusing the same MedCLIP model weights. This second stage aimed to leverage the
semantic understanding gained during concept identification to help generate contextually relevant
textual descriptions for the images. For this task, each image was preprocessed, converted to
RGB format, and then paired with its corresponding caption from the dataset. The MedCLIP
processor and tokenization pipeline from the Transformers library were utilized to prepare these
multimodal inputs for the model. In the caption prediction task, their approach yielded scores of
0.7957 for Similarity, 0.5553 for BERTScore Recall, 0.1607 for ROUGE-1, and 0.2806 for BLEURT,
placing them eighth in this subtask as well.
      </p>
      <p>
        CS_Morgan [
        <xref ref-type="bibr" rid="ref23">23</xref>
        ] The CS_Morgan team investigated six distinct captioning pipelines by fine-tuning
three vision-language backbones (Qwen-2B, Qwen2.5-3B, and SmolVLM-500M) on the ROCOv2
dataset. They evaluated a vanilla LoRA-based adaptation (Submissions 1–3) and a
modality-conditioned variant (Submissions 4–6) in which a ResNet-50 classifier (trained from scratch on
four modalities: CT, MRI, Ultrasound, Radiograph) first predicts the image modality. During
inference, the predicted modality label is concatenated to the prompt (e.g., “CT image: [image].
Describe the medical image.”) to guide the caption generator toward modality-specific terminology.
Across these six runs, Qwen-2B achieved the highest Overall score (0.2537) when fine-tuned
without classification, while both Qwen2.5-3B and SmolVLM demonstrated improved BLEURT and
MedCAT scores under modality-conditioned prompting. This two-stage pipeline highlights that
even smaller models like SmolVLM-500M can approach mid-scale performance when provided
with structured modality cues.
      </p>
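      <p>The modality-conditioned prompting step reduces to prepending the classifier’s label to the
instruction, as in this small sketch that reuses the wording quoted above.</p>
      <preformat>
def modality_conditioned_prompt(predicted_modality):
    """Prepend the predicted modality (e.g., "CT", "MRI", "Ultrasound",
    "Radiograph") to the captioning prompt."""
    return f"{predicted_modality} image: [image]. Describe the medical image."

print(modality_conditioned_prompt("CT"))
# CT image: [image]. Describe the medical image.
      </preformat>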
      <p>Baseline For this year’s baseline models in the caption prediction subtask, we utilized off-the-shelf
vision-language models to generate appropriate captions based on the challenge images.
Specifically, we evaluated the performance of the following instruction-tuned models: Meta’s LLaMA
4 Scout (17Bx16E) Instruct [57], Google DeepMind’s Gemma 3 27B Instruct [58], and Alibaba
Cloud’s Qwen2.5-VL 32B Instruct [59]. Each model was prompted individually with the challenge
images and the following standardized instruction prompt in-context:
"You are a medical expert contributing to a peer-reviewed scientific journal. Your task
is to write a caption for a medical image, exactly as it would appear beneath a figure
in a PubMed-indexed article. Concisely describe the clinical content of the image,
identifying the imaging modality, key medical concepts, anatomical structures, visible
markings, and any relevant abnormalities or pathologies. Where appropriate, include
standard abbreviations in addition to full terms for modality, medical concepts, and
pathologies (e.g., ’magnetic resonance imaging (MRI)’). Do not include any
explanations, introductions, titles, figure numbers (e.g., ’Figure 1:’ / ’Fig 1:’), references, or
bullet points. Text only the caption."
To ensure reproducibility, we employed a deterministic decoding strategy by setting the top-k
sampling parameter to k = 1, thereby always selecting the most likely predicted token at each
step. Among the three baseline models evaluated, Meta’s LLaMA 4 Scout (17Bx16E) Instruct
model performed best, obtaining an overall challenge score of 0.3101. This result positioned it
approximately in the middle range of the submitted participant approaches. Notably, LLaMA 4
Scout achieved the highest scores in the Similarity metric (0.9369) and BLEURT metric (0.3258).</p>
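      <p>A hedged sketch of this baseline setup is shown below; it uses the smaller llava-hf/llava-1.5-7b-hf
checkpoint as a stand-in for the larger instruction-tuned models named above, abridges the instruction
prompt, and relies on greedy decoding (do_sample=False), which always takes the most likely token and is
therefore equivalent to a top-k of 1.</p>
      <preformat>
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "llava-hf/llava-1.5-7b-hf"  # stand-in model, not one of the actual baselines
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

INSTRUCTION = ("You are a medical expert contributing to a peer-reviewed scientific "
               "journal. Your task is to write a caption for a medical image. "
               "Text only the caption.")  # abridged version of the prompt above

messages = [{"role": "user",
             "content": [{"type": "image"}, {"type": "text", "text": INSTRUCTION}]}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

image = Image.open("example_radiology_image.png").convert("RGB")  # placeholder file
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.float16)

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)  # greedy decoding

caption = processor.batch_decode(
    output_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)[0]
print(caption.strip())
      </preformat>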
      <p>In the 2025 ImageCLEFmedical Caption Prediction subtask, all participating teams used
vision-language models (VLMs) as the basis for their methods, showing a clear trend of using recent advances
in multimodal architectures. Most submissions used or fine-tuned Transformer-based models, such as
BLIP, InstructBLIP, and LLaMA variants. This indicates a reliance on pretrained models with strong
image-text alignment capabilities. Several teams incorporated retrieval-augmented generation (RAG),
multi-stage pipelines, or modular architectures to improve alignment with medical content. However,
performance gains from these methods varied depending on the accuracy of supporting components,
such as concept detection systems. Additionally, some teams used post-processing strategies, such
as reranking or summarization. Despite the variety of approaches, models with direct fine-tuning on
medical data and minimal architectural complexity often outperformed more elaborate pipelines. This
result highlights the continued relevance of focused adaptation.</p>
      <p>The results of the ImageCLEFmedical 2025 Caption Prediction subtask indicate a notable shift in
evaluation priorities from general linguistic similarity toward a more balanced assessment of relevance
and clinical factuality. Teams such as UMUTeam and DS4DH exhibited strong performance across both
the relevance and factuality dimensions, outperforming several returning participants.</p>
      <p>The analysis indicates that linguistic similarity metrics, such as BERTScore and ROUGE, demonstrate
a high degree of consistency with those observed in the previous year, suggesting stable performance in
terms of surface-level textual alignment. Embedding-based similarity scores are notably elevated among
the top-performing submissions, suggesting that the generated captions may encompass semantically
relevant content that extends beyond the scope of the original reference captions. This finding suggests
a potential discrepancy between lexical overlap and underlying semantic alignment. Factuality-oriented
metrics such as UMLS Concept F1-score and AlignScore remain relatively low, underscoring the inherent
difficulty of ensuring clinical accuracy in generated captions. However, reliance on the original captions
as the sole reference may limit the effectiveness of these scores in evaluating the full range of medically
plausible outputs.</p>
      <sec id="sec-5-1">
        <title>5.3. Results for the Explainability Subtask</title>
        <p>
          This year, two teams participated in the explainability subtask. Table 9 presents the summarised results
for both teams.
        </p>
        <p>
          AUEB NLP Group [
          <xref ref-type="bibr" rid="ref15">15</xref>
          ] The AUEB NLP Group extracted UMLS concepts from the captions generated
by their fine-tuned InstructBLIP [46] model using a biomedical NER model of the ScispaCy library.
GPT-4o was used to identify bounding boxes for these concepts. The group reached the best
overall rating of 3.2 by the radiologist. However, it should be noted that the explainability
approach focuses solely on the generated captions and does not involve the black-box model
itself, which means it does not enhance the radiologist’s trust in the model’s predictions.
        </p>
        <p>
          JJ-VMed [
          <xref ref-type="bibr" rid="ref20">20</xref>
          ] For the explainability task, JJ-VMed implemented a three-phase approach: Spatial
mapping using GPT-4 and GPT-4V to link concepts and textual descriptions with image regions,
segmentation and object detection using SAM [
          <xref ref-type="bibr" rid="ref31">60</xref>
          ] (Segment Anything Model) and YOLOv8 [
          <xref ref-type="bibr" rid="ref32">61</xref>
          ],
as well as visualisation heuristics, such as arrow-following and keypoint-detection. The outputs
included bounding boxes, segmentation masks, and heatmaps. The team achieved an overall
rating of 2.6. Similar to the winning approach, this method does not incorporate the black-box
model itself, and therefore the explanations do not contribute to increasing trust in the model’s
predictions.
        </p>
        <p>
          In summary, both approaches used bounding boxes to visualise the connection between the images and
specific concepts of the captions. The JJ-VMed team also provided heatmaps. Both visualisation methods
are clinically valid. Although similar visualisation methods were used, the underlying generation techniques
differed strongly. While the AUEB NLP Group combined NER with GPT-4o to generate
bounding boxes, the JJ-VMed team combined GPT-4V models with YOLO object detection and the Segment
Anything Model (SAM) for segmentation. Both of the methods used to generate the explainability
visualisations are based on external models. These models have no direct integration with the black-box
model responsible for generating the captions. In conclusion, the visualizations do not contribute to
increasing the clinicians’ trust in the presented captioning model. More appropriate approaches for this
task would be to use attention maps [
          <xref ref-type="bibr" rid="ref33">62</xref>
          ], GradCAM [
          <xref ref-type="bibr" rid="ref34">63</xref>
          ], or Layer-wise Relevance Propagation (LRP)
[
          <xref ref-type="bibr" rid="ref35">64</xref>
          ], to generate model-intrinsic explanations that highlight the regions or features within the image
that actually influenced the captioning output, thereby providing more meaningful insights into the
model’s decision-making process.
        </p>
        <p>During the manual validation, it was found that both participating teams were generally able to
identify the imaging modality and the approximate anatomical region depicted in the images. However,
substantial limitations were observed in the accurate identification and spatial localization of anatomical
structures and pathological findings. A recurring issue across both submissions involved the inaccurate
placement, scale, and labeling of bounding boxes. Frequently, the annotations only partially covered the
target anatomical entities or failed to capture them entirely. Both teams generated syntactically coherent
and clinically plausible captions, though with notable differences in level of detail and accuracy. The
AUEB NLP Group demonstrated greater accuracy in the identification and localisation of anatomical
entities, resulting in more precise but less informative annotations. In contrast, JJ-VMed produced more
detailed and descriptive captions, albeit often based on incorrect concept detection.</p>
      </sec>
    </sec>
    <sec id="sec-6">
      <title>6. Conclusion</title>
      <p>The 9th edition of the ImageCLEFmedical Caption task continued its evolution with three components:
the established Concept Detection and Caption Prediction subtasks, and the promotion of Explainability
to a fully graded subtask. This year’s challenge introduced an enlarged dataset featuring the new Optical
Coherence Tomography (OCT) modality and a revised evaluation framework for captioning. The task
attracted 11 teams who submitted a total of 149 graded runs, a substantial increase in submissions
fostered by a higher run quota. Participation was balanced, with six teams entering both core subtasks,
three focusing solely on concept detection, and two on caption prediction. Two teams took on the new
explainability challenge.</p>
      <p>For the concept detection subtask, the top-performing methods continued to rely on powerful
ensembles of Convolutional Neural Networks (CNNs). However, a notable trend was the exploration of
transformer-based and generative approaches by several teams, signalling a potential shift in
methodology for future challenges.</p>
      <p>In the caption prediction subtask, a clear consensus emerged around vision-language models (VLMs),
with all teams leveraging architectures like BLIP, LLaMA, and their variants. Interestingly, direct
fine-tuning on medical data often outperformed more elaborate pipelines, such as Retrieval-Augmented
Generation (RAG), which proved sensitive to the quality of their retrieval components, highlighting the
challenge of system interdependencies.</p>
      <p>In a reversal from 2024, primary F1-scores for concept detection saw a general decline across the
leaderboard. This is attributed to the increased difficulty of the 2025 dataset, which featured new
modalities like OCT and greater multi-label complexity. Despite this, secondary F1-scores on curated
concepts remained high, indicating that models still perform robustly on core clinical findings.</p>
      <p>The introduction of a composite score for caption prediction, averaging six metrics for relevance and
factuality, successfully shifted the focus toward a more holistic evaluation. While relevance scores were
strong, factuality metrics like UMLS F1-score and AlignScore remain modest across all submissions,
underscoring that generating clinically accurate text is still the primary hurdle for the field. Notably,
an off-the-shelf LLaMA 4 Scout baseline proved competitive, establishing a strong benchmark and
demonstrating that while large foundation models are powerful, specialised fine-tuning still provides a
winning edge.</p>
      <p>Looking ahead, a primary focus for the 2026 challenge will be on advancing the maturity of the
explainability task. This year’s initial submissions relied on post-hoc visualisations generated by
external models. While a valid first step, these methods do not offer insights into the captioning model’s
internal decision-making process. Future iterations will therefore strongly encourage the development
of model-intrinsic explanations, such as attention maps or GradCAM, to foster genuine trust in the
underlying VLM. Furthermore, the 2026 edition will broaden the task’s scope and realism. The dataset
will be extended again with recent PubMed Central publications, and to address the multilinguality of
scientific literature, non-English captions will be translated and incorporated into the dataset, whereas
previously they were omitted. For images that lack a direct caption, a baseline description will be
generated from the context of the source article. The introduction of multilingual
data and a continued focus on model transparency are intended to stimulate further research toward
capable and reliable medical image understanding systems.</p>
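      <p>As a pointer toward the model-intrinsic explanations encouraged for 2026, the sketch below outlines
Grad-CAM over the last convolutional block of an image classifier: gradients of a target score are pooled
into channel weights and combined with the feature maps to localise the supporting evidence. The
torchvision backbone, random input, and target class index are placeholders; participants would adapt the
idea to their own concept-detection or captioning models.</p>
      <preformat>
import torch
from torchvision.models import resnet18

model = resnet18(weights=None).eval()          # placeholder backbone with untrained weights
feats, grads = [], []
layer = model.layer4                           # last convolutional block
layer.register_forward_hook(lambda m, i, o: feats.append(o))
layer.register_full_backward_hook(lambda m, gi, go: grads.append(go[0]))

image = torch.randn(1, 3, 224, 224)            # stand-in for a preprocessed radiology image
logits = model(image)
logits[0, 281].backward()                      # placeholder target class index

weights = grads[0].mean(dim=(2, 3), keepdim=True)   # global-average-pool the gradients
cam = (weights * feats[0]).sum(dim=1)               # weighted sum over feature channels
cam = torch.clamp(cam, min=0)                       # keep only positive evidence (ReLU)
cam = cam / (cam.max() + 1e-8)                      # normalise to [0, 1] for overlaying
print(cam.shape)                                    # a 1x7x7 map to upsample onto the image
      </preformat>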
    </sec>
    <sec id="sec-7">
      <title>Acknowledgments</title>
      <p>The work of Louise Bloch, Benjamin Bracke and Raphael Brüngel was partially funded by a PhD grant
from the University of Applied Sciences and Arts Dortmund (FH Dortmund), Germany. The work of
Ahmad Idrissi-Yaghir, Henning Schäfer, Tabea M. G. Pakull, Hendrik Damm, Helmut Becker, and Bahadır
Eryılmaz was funded by a PhD grant from the DFG Research Training Group 2535 Knowledge- and
data-based personalisation of medicine at the point of care (WisPerMed). This work was partly supported by
the project GRESEL-UNED PID2023-151280OB-C22 funded by MICIU/AEI/10.13039/501100011033.</p>
    </sec>
    <sec id="sec-8">
      <title>Declaration on Generative AI</title>
      <p>During the preparation of this work, the authors used ChatGPT for grammar and spelling checking. After
using this service, the authors reviewed and edited the content as needed and take full responsibility
for the publication’s content.</p>
    </sec>
  </body>
  <back>
  </back>
</article>