<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-title-group>
        <journal-title>Conference and Labs of the Evaluation Forum, September</journal-title>
      </journal-title-group>
    </journal-meta>
    <article-meta>
      <title-group>
        <article-title>Overview of ImageCLEFmedical 2024 - Caption Prediction and Concept Detection</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author">
          <string-name>Johannes Rückert</string-name>
          <email>johannes.rueckert@fh-dortmund.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Asma Ben Abacha</string-name>
          <email>abenabacha@microsoft.com</email>
          <xref ref-type="aff" rid="aff4">4</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Alba G. Seco de Herrera</string-name>
          <email>alba.garcia@essex.ac.uk</email>
          <xref ref-type="aff" rid="aff5">5</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Louise Bloch</string-name>
          <email>louise.bloch@fh-dortmund.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff1">1</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Raphael Brüngel</string-name>
          <email>raphael.bruengel@fh-dortmund.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff1">1</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Ahmad Idrissi-Yaghir</string-name>
          <email>ahmad.idrissi-yaghir@fh-dortmund.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Henning Schäfer</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Benjamin Bracke</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Hendrik Damm</string-name>
          <email>hendrik.damm@fh-dortmund.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Tabea M. G. Pakull</string-name>
          <email>tabeamargaretagrace.pakull@uk-essen.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff3">3</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Cynthia Sabrina Schmidt</string-name>
          <email>cynthia.schmidt@uk-essen.de</email>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Henning Müller</string-name>
          <xref ref-type="aff" rid="aff6">6</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Christoph M. Friedrich</string-name>
          <email>christoph.friedrich@fh-dortmund.de</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <!-- NOTE(review): removed a stray <contrib> whose only content was the
             string "Spain" — an extraction artifact (affiliation fragment
             misparsed as an author), not an actual contributor. -->
        <aff id="aff0">
          <label>0</label>
          <institution>Department of Computer Science, University of Applied Sciences and Arts Dortmund</institution>
          ,
          <addr-line>Dortmund</addr-line>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff1">
          <label>1</label>
          <institution>Institute for Artificial Intelligence in Medicine (IKIM), University Hospital Essen</institution>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff2">
          <label>2</label>
          <institution>Institute for Medical Informatics, Biometry and Epidemiology (IMIBE), University Hospital Essen</institution>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff3">
          <label>3</label>
          <institution>Institute for Transfusion Medicine, University Hospital Essen</institution>
          ,
          <addr-line>Essen</addr-line>
          ,
          <country country="DE">Germany</country>
        </aff>
        <aff id="aff4">
          <label>4</label>
          <institution>Microsoft</institution>
          ,
          <addr-line>Redmond, Washington</addr-line>
          ,
          <country country="US">USA</country>
        </aff>
        <aff id="aff5">
          <label>5</label>
          <institution>University of Essex</institution>
          ,
          <country country="GB">UK</country>
        </aff>
        <aff id="aff6">
          <label>6</label>
          <institution>University of Geneva</institution>
          ,
          <country country="CH">Switzerland</country>
        </aff>
      </contrib-group>
      <pub-date>
        <year>2024</year>
      </pub-date>
      <volume>0</volume>
      <fpage>9</fpage>
      <lpage>12</lpage>
      <abstract>
        <p>The ImageCLEFmedical 2024 Caption task on caption prediction and concept detection follows similar challenges held from 2017-2023. The goal is to extract Unified Medical Language System (UMLS) concept annotations and/or define captions from image data. Predictions are compared to original image captions. Images for both tasks are part of the Radiology Objects in COntext version 2 (ROCOv2) dataset. For concept detection, multi-label predictions are compared against UMLS terms extracted from the original captions with additional manually curated concepts via the F1-score. For caption prediction, the semantic similarity of the predictions to the original captions is evaluated using the BERTScore. The task attracted strong participation with 50 registered teams; 14 teams submitted 82 graded runs for the two subtasks. Participants mainly used multi-label classification systems for the concept detection subtask; the winning team DBS-HHU utilized an ensemble of four different Convolutional Neural Networks (CNNs). For the caption prediction subtask, most teams used encoder-decoder frameworks with various backbones, including transformer-based decoders and Long Short-Term Memories (LSTMs), with the winning team PCLmed using medical vision-language foundation models (Med-VLFMs) by combining general and specialist vision models.</p>
      </abstract>
      <kwd-group>
        <kwd>ImageCLEF</kwd>
        <kwd>Computer Vision</kwd>
        <kwd>Multi-Label Classification</kwd>
        <kwd>Image Captioning</kwd>
        <kwd>Image Understanding</kwd>
        <kwd>Radiology</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec id="sec-1">
      <title>1. Introduction</title>
      <p>ImageCLEF1 is the image retrieval and classification lab of the Conference and Labs of the Evaluation
Forum (CLEF) conference. ImageCLEF 2024 consists of the ImageCLEFmedical, ImageCLEFrecommending,
Image Retrieval for Arguments (Touché) and ImageCLEFToPicto labs, with the ImageCLEFmedical lab
being divided into the subtasks Caption (Image Captioning), VQA (text-to-image generation),
MEDIQAMAGIC (Multimodal And Generative TelemedICine), and GANs (generation of medical images).</p>
      <p>
        The Caption task was first proposed as part of the ImageCLEFmedical [
        <xref ref-type="bibr" rid="ref1">1</xref>
        ] in 2016. In 2017 and
2018 [
        <xref ref-type="bibr" rid="ref2 ref3">2, 3</xref>
        ] the ImageCLEFmedical caption task comprised two subtasks: concept detection and caption
prediction. In 2019 [
        <xref ref-type="bibr" rid="ref4">4</xref>
        ] and 2020 [
        <xref ref-type="bibr" rid="ref5">5</xref>
        ], the task concentrated on the concept detection subtask extracting
Unified Medical Language System® (UMLS) Concept Unique Identifiers (CUIs) [
        <xref ref-type="bibr" rid="ref6">6</xref>
        ] from radiology
images.
      </p>
      <p>
        In 2021 [
        <xref ref-type="bibr" rid="ref7">7</xref>
        ], both subtasks, concept detection and caption prediction, were running again due to
participants' demands. The focus in 2021 was on making the task more realistic by using fewer images
which were all manually annotated by medical doctors. As additional data of similar quality is hard
to acquire, the 2022 ImageCLEFmedical caption task [
        <xref ref-type="bibr" rid="ref8">8</xref>
        ] continued with both subtasks albeit with an
extended version of the Radiology Objects in COntext (ROCO) [
        <xref ref-type="bibr" rid="ref9">9</xref>
        ] dataset used for both subtasks, which
was already used in 2020 and 2019. The 2023 edition of ImageCLEFmedical caption [
        <xref ref-type="bibr" rid="ref10">10</xref>
        ] continued in
the same vein, once again using a ROCO-based dataset for both subtasks but switching from BiLingual
Evaluation Understudy (BLEU) [
        <xref ref-type="bibr" rid="ref11">11</xref>
        ] to BERTScore [
        <xref ref-type="bibr" rid="ref12">12</xref>
        ] as the primary evaluation metric for caption
prediction. For the 8th edition in 2024, additional metrics as well as an optional explainability extension
are introduced for the caption prediction.
      </p>
      <p>
        This paper sets forth the approaches for the caption task: automated cross-referencing of medical
images and captions into predicted coherent captions and UMLS concept detection in radiology images
as a separate subtask. This task is a part of the ImageCLEF benchmarking campaign, which has proposed
medical image understanding tasks since 2003; a new suite of tasks is generated each subsequent year.
Further information on the other proposed tasks at ImageCLEF 2024 can be found in Ionescu et al. [
        <xref ref-type="bibr" rid="ref13">13</xref>
        ].
      </p>
      <p>
        This is the 8th edition of the ImageCLEFmedical caption task. Just like in 2016 [
        <xref ref-type="bibr" rid="ref1">1</xref>
        ], 2017 [
        <xref ref-type="bibr" rid="ref2">2</xref>
        ], 2018 [
        <xref ref-type="bibr" rid="ref3">3</xref>
        ],
2021 [
        <xref ref-type="bibr" rid="ref7">7</xref>
        ], 2022 [
        <xref ref-type="bibr" rid="ref8">8</xref>
        ], and 2023 [
        <xref ref-type="bibr" rid="ref10">10</xref>
        ] both subtasks of concept detection and caption prediction are included
in ImageCLEFmedical 2024 Caption.
      </p>
      <p>Manual generation of the knowledge of medical images is a time-consuming process prone to
human error. As this process requires assistance for the better and easier diagnoses of diseases that are
susceptible to radiology screening, it is important that we better understand and refine automatic systems
that aid in the broad task of radiology-image metadata generation. The purpose of the ImageCLEFmedical
2024 caption prediction and concept detection tasks is the continued evaluation of such systems. Concept
detection and caption prediction information is applicable to unlabelled and unstructured datasets and
medical datasets that do not have textual metadata. The ImageCLEFmedical caption task focuses on the
medical image understanding in the biomedical literature and specifically on concept extraction and
caption prediction based on the visual perception of the medical images and medical text data such as
medical caption or UMLS CUIs paired with each image (see Figure 1).</p>
      <p>
        In 2024, for the development data, the newly released ROCOv2 [
        <xref ref-type="bibr" rid="ref14">14</xref>
        ] dataset, a new iteration of the
ROCO [
        <xref ref-type="bibr" rid="ref9">9</xref>
        ] dataset, was used, with new images from the PubMed Central® (PMC) [
        <xref ref-type="bibr" rid="ref15">15</xref>
        ] Open Access
subset added for the test set, while images from articles with licenses other than CC BY and CC BY-NC
were removed.
      </p>
      <p>This paper presents an overview of the ImageCLEFmedical 2024 Caption task including the task and
participation in Section 2, the data creation in Section 3, and the evaluation methodology in Section 4.
The results are described in Section 5, followed by conclusion in Section 6.</p>
      <sec id="sec-1-1">
        <title>1https://www.imageclef.org/ [last accessed: 2024-07-01]</title>
      </sec>
    </sec>
    <sec id="sec-2">
      <title>2. Task and Participation</title>
      <p>In 2024, the ImageCLEFmedical Caption task consisted of two subtasks: concept detection and caption
prediction.</p>
      <p>
        The concept detection subtask follows the same format proposed since the start of the task in 2017 [
        <xref ref-type="bibr" rid="ref2">2</xref>
        ].
Participants are asked to predict a set of concepts defined by the UMLS CUIs [
        <xref ref-type="bibr" rid="ref6">6</xref>
        ] based on the visual
information provided by the radiology images.
      </p>
      <p>
        The caption prediction subtask follows the original format of the subtask used between 2017 and
2018 [
        <xref ref-type="bibr" rid="ref2 ref3">2, 3</xref>
        ]. This subtask was paused and has been running again since 2021 because of participant demand.
This subtask aims to automatically generate captions for the radiology images provided. This year,
an optional new experimental explainability extension has been introduced for the caption prediction
task. This extension aims to improve the understanding of the models by asking participants to provide
explanations, such as heat maps or Shapley values [
        <xref ref-type="bibr" rid="ref16 ref17">16, 17</xref>
        ], for a selected number of images. These
explanations are manually reviewed to assess their effectiveness and clarity.
      </p>
      <p>
        In 2024, 50 teams registered and signed the End-User-Agreement that is needed to download the
development data. 14 teams submitted 82 graded runs for evaluation (13 teams submitted working
notes) attracting a similar number of teams as in 2023 [
        <xref ref-type="bibr" rid="ref10">10</xref>
        ], with an overall lower number of graded
runs. Each of the groups was allowed a maximum of 10 graded runs per subtask.
      </p>
      <p>
        Table 1 shows all the teams who participated in the task and their submitted runs. This year, 9 teams
participated in the concept detection subtask, 3 of those teams also participated in 2023 [
        <xref ref-type="bibr" rid="ref10">10</xref>
        ]. Of the 11
teams that submitted runs to the caption prediction subtask, 5 also participated in 2023. 3 of the teams
participated also in 2022. Overall, 6 teams participated in both subtasks, and 5 teams participated only
in the caption prediction subtask. Unlike in 2023, 3 teams participated only in the concept detection
subtask.
      </p>
    </sec>
    <sec id="sec-3">
      <title>3. Data Creation</title>
      <p>
        Like last year, a dataset that originates from biomedical articles of the PMC Open Access Subset2 [
        <xref ref-type="bibr" rid="ref15">15</xref>
        ]
was used and was extended with new images added since the last time the dataset was updated in
      </p>
      <sec id="sec-3-1">
        <title>2https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist/ [last accessed: 2024-07-01]</title>
        <p>
          Team
SSNMLRGKSR* [
          <xref ref-type="bibr" rid="ref21">21</xref>
          ]
CS_Morgan* [
          <xref ref-type="bibr" rid="ref22">22</xref>
          ]
UACH-VisionLab [23]
MICLab [24]
Department of Informatics, Athens University of
Economics and Business, Athens, Greece
Heinrich-Heine-Universität Düsseldorf, Düsseldorf,
Germany
University of Information Technology, Ho Chi Minh
City, Vietnam
Department of CSE, Sri Sivasubramaniya Nadar
College of Engineering, Chennai, India
Computer Science Department, Morgan State
University, Baltimore, Maryland
Facultad de Ingeniería, Universidad Autónoma de
Chihuahua, Chihuahua, Mexico
School of Electrical and Computer Engineering,
Universidade Estadual de Campinas, Campinas,
Brazil
Department of CSE, SSN College of Engineering,
Chennai, India
Peng Cheng Laboratory, Shenzhen, China and
ADSPLAB, School of Electronic and Computer
Engineering, Peking University, Shenzhen, China
Vellore Institute of Technology (VIT), Chennai, India
KDE Laboratory, Department of Computer Science
and Engineering, Toyohashi University of
Technology, Aichi, Japan
University of Information Technology, Ho Chi Minh
City, Vietnam
Faculty of Information Science and Engineering,
University of Information Technology, Ho Chi Minh
City, Vietnam
October 2022. An advantage of using new images for the test set is that contamination of models trained
on PMC data is not an issue, since the models in use today were mostly trained prior to 2023. The
development dataset for this year consists of the images from the newly released ROCOv2 [
          <xref ref-type="bibr" rid="ref14">14</xref>
          ] dataset.
        </p>
        <p>Once again, no extensive caption pre-processing beyond the removal of links was performed to keep
the captions as realistic as possible. Captions in languages other than English were removed.</p>
        <p>From the resulting captions, concepts were extracted using the Medical Concept Annotation Toolkit
(MedCAT) [31]. MedCAT, which is capable of extracting biomedical concepts from unstructured text,
was trained on the Medical Information Mart for Intensive Care (MIMIC)-III dataset [32] and links
to Systematized Nomenclature of Medicine and Clinical Terms (SNOMED CT) IDs, which were later
mapped to CUIs and Type Unique Identifiers (TUIs) of the UMLS2022AB release 3. During concept
extraction, concepts were retained only if they exceeded a frequency threshold of 10 occurrences, and
semantic filters were applied to focus on visually observable and interpretable concepts. For example,
concepts of semantic type T029 (Body Location or Region) or T060 (Diagnostic Procedure) are relevant,
while concepts of semantic type T054 (Social Behavior) cannot be derived from the image if it would
appear in the caption. In addition, manual filtering was performed to exclude UMLS concepts that were
either incorrectly detected by the pipeline or were still not related to the image content in any way
after semantic filtering. Blacklisted concepts often include qualifiers that would divert actual interest to,
3https://www.nlm.nih.gov/pubs/techbull/nd22/nd22_umls_2022ab_release_available.html [last accessed: 2024-07-01]
for example, anatomical localization or a pathological process, and would also introduce bias, since
qualifiers are used in a highly individual and variable manner. Entity linking systems tend to link
concepts with ambiguous synonyms incorrectly, e.g. C0994894 (Patch Dosage Form) may be linked if
the caption refers to a region that is patchy. In case of high frequency occurrence of such concepts,
they were merged to the correct concept via mapping.</p>
        <p>Additional concepts were assigned to all images addressing their image modality. Six medical image
modalities of concepts were covered: X-ray, Computed Tomography (CT), Magnetic Resonance Imaging
(MRI), ultrasound, and Positron Emission Tomography (PET) as well as modality combinations (e.g.,
PET/CT) as standalone concept. For images of the X-ray modality further concepts on the represented
anatomy were assigned, covering specific anatomical body regions of the Image Retrieval in Medical
Application (IRMA) [33] classification: cranium, spine, upper extremity/arm, chest, breast/mamma,
abdomen, pelvis, and lower extremity/leg. New for last year’s dataset was the addition of manually
validated directionality concepts for x-ray images. Directionality refers to the x-ray imaging orientation
according to IRMA: coronal posteroanterior (PA), coronal anteroposterior (AP), sagittal, or transversal.
These concepts were not included in this year’s dataset because the medical expertise and time to both
ensure the quality of the directionality concepts for the development dataset as well as validate new
directionality concepts on the test set was not available. Table 2 shows statistics about the number of
concepts for the datasets of the last three years.</p>
        <p>The following subsets were distributed to the participants where each image has one caption and
one or more concepts (UMLS-CUI):
• Training set including 70,108 radiology images and associated captions and concepts, with a total
of 220,859 concept occurrences and 1945 unique concepts.
• Validation set including 9972 radiology images and associated captions and concepts, with a total
of 32,060 concept occurrences and 1751 unique concepts.
• Test set including 17,237 radiology images, with a total of 48,563 concept occurrences and 700
unique concepts.</p>
      </sec>
    </sec>
    <sec id="sec-4">
      <title>4. Evaluation Methodology</title>
      <p>In this year’s edition, the performance evaluation for the concept detection subtask is carried out in the
same way as last year. Both tasks are evaluated separately. The AI4MediaBench4 by AIMultimediaLab5
was used as the challenge platform. Like last year, participants were unaware of their own scores on the</p>
      <sec id="sec-4-1">
        <title>4https://ai4media-bench.aimultimedialab.ro/ [last accessed: 2024-07-01] 5https://www.aimultimedialab.ro/ [last accessed: 2024-07-01]</title>
        <p>Year</p>
        <p>Split
train
2022 valid
test
train
2023 valid
test
train
2024 valid
test
test set until after the submission deadline. This was done to avoid teams optimizing their approaches
based on test set results, which would amount to information leakage.</p>
        <p>For the concept detection subtask, the balanced precision and recall trade-off was measured in terms
of F1-scores. Like last year, a secondary F1-score is computed using a subset of concepts that was
manually curated. On the one hand, this involves the different image modalities (X-ray, Angiography,
Ultrasound, CT, MRI, PET, and Combined such as PET/CT). On the other hand, if applicable, for X-ray
also the most prominently depicted body region (cranium, chest, upper extremity, spine, abdomen,
pelvis, and lower extremity) was involved.</p>
        <p>
          As a pre-processing step for evaluating the second task, all captions were lowercased, punctuation
was removed, and numbers were replaced by the token “number”. This step ensures uniformity and
focuses the evaluation on the linguistic content. The performance of caption prediction is evaluated
based on BERTScore [
          <xref ref-type="bibr" rid="ref12">12</xref>
          ], which is a metric that computes a similarity score for each token in the
generated text with each token in the reference text. It uses the pre-trained contextual embeddings
from Bidirectional Encoder Representations from Transformers (BERT) [34]-based models and matches
words by cosine similarity. In this work, the pre-trained model microsoft/deberta-xlarge-mnli6 was
used because it is the model that correlates best with human scoring according to the authors7. Since
evaluating generated text and image captioning is very challenging and should not be based on a single
metric, additional evaluation metrics were explored in this year’s edition in order to find the metrics
that correlate well with human judgments for this task. First, the Recall-Oriented Understudy for
Gisting Evaluation (ROUGE) [35] score was adopted as a secondary metric that counts the number of
overlapping units such as n-grams, word sequences, and word pairs between the generated text and the
reference. Specifically, the ROUGE-1 (F-measure) score was calculated, which measures the number of
matching unigrams between the model-generated text and a reference. All individual scores for each
caption are then summed and averaged over the number of captions, resulting in the final score. In
addition to ROUGE, the Metric for Evaluation of Translation with Explicit ORdering (METEOR) [36] was
explored, which is a metric that evaluates the generated text by aligning it to reference and calculating
a sentence-level similarity score. Furthermore, the Consensus-based Image Description Evaluation
(CIDEr) [37] metric was also adopted. CIDEr is an automatic evaluation metric that calculates the
weights of n-grams in the generated text, and the reference text based on Term Frequency and Inverse
Document Frequency (TF-IDF) and then compares them based on cosine similarity. Another metric
used is the BiLingual Evaluation Understudy (BLEU) score [
          <xref ref-type="bibr" rid="ref11">11</xref>
          ], which is a geometric mean of n-gram
scores from 1 to 4. For this task, the focus was on the BLEU-1 score, which takes into account unigram
precision. BiLingual Evaluation Understudy with Representations from Transformers (BLEURT) [38] is
specifically designed to evaluate natural language generation in English. It uses a pre-trained model that
has been fine-tuned to emulate human judgments about the quality of the generated text. The strength
of BLEURT lies in its end-to-end training, which enables it to model human judgments effectively
and makes it robust to domain and quality variations. For this evaluation, the BLEURT-20 model
was used. CLIPScore [39] is an innovative metric that diverges from the traditional reference-based
evaluations of image captions. Instead, it aligns with the human approach of evaluating caption quality
without references by evaluating the alignment between text and image content. The metric employs
Contrastive Language-Image Pretraining (CLIP) [40], a cross-modal model that has been pre-trained on a
massive dataset of 400 million image-caption pairs sourced from the web. The model is used to compute
similarity scores between images and text. In addition to the reference-free CLIPScore, this evaluation
also considers RefCLIPScore [39], an extension that incorporates reference captions. This year, two new
domain-specific metrics, MedBERTScore and ClinicalBLEURT [ 41], have been added to the evaluation.
These metrics are tailored for evaluating text in medical contexts and aim to better assess the relevance
and accuracy of the generated medical content. MedBERTScore enhances the traditional BERTScore by
assigning higher weights to medically relevant terms identified in the text. ClinicalBLEURT is a version
of BLEURT fine-tuned on large collections of family medicine and orthopedic notes to better capture
        </p>
      </sec>
      <sec id="sec-4-2">
        <title>6https://huggingface.co/microsoft/deberta-xlarge-mnli [last accessed: 2023-07-01] 7https://github.com/Tiiiger/bert_score [last accessed: 2023-07-01]</title>
        <p>Group Name
DBS-HHU
auebnlpgroup
DS@BioMed
SSNMLRGKSR
UACH-VisionLab
MICLabNM
Kaprov
VIT_ConceptZ
CS_Morgan
the characteristics of the medical language.</p>
      </sec>
    </sec>
    <sec id="sec-5">
      <title>5. Results</title>
      <p>For the concept detection and caption prediction subtasks, Tables 3 and 4 show the best results from
each of the participating teams. The results will be discussed in this section. The full list of results is
shown in Appendix A in Tables 7, 8 and 9.</p>
      <sec id="sec-5-1">
        <title>5.1. Results for the Concept Detection Subtask</title>
        <p>
          In 2024, 9 teams participated in the concept prediction subtask, submitting 38 graded runs. Table 3
presents the best results for each team achieved in the submissions.
DBS-HHU [
          <xref ref-type="bibr" rid="ref19">19</xref>
          ] Dethroning the winners of the last several years, the DBS-HHU team achieved the
best F1-scores of 0.6375 (primary) and 0.9534 (secondary) by using an ensemble of four different
Convolutional Neural Networks (CNNs): ResNet-152 [42], EfficientNet-B0 [43], DenseNet-201 [44],
and Wide ResNet-101-2 [45], all pre-trained on ImageNet [46] and followed by different
FeedForward Neural Networks (FFNNs). Additionally, they experimented with building a hierarchical
system of several models, specifically oriented towards the AUEB-NLP-Group’s approach of prior
years. However, these did not beat the best results of their first strategy.
        </p>
        <p>
          AUEB-NLP-Group [
          <xref ref-type="bibr" rid="ref18">18</xref>
          ] The AUEB-NLP-Group based their approach on their past work, which won
the competition in the last several years, by combining a CNN (DenseNet [44]) followed by a
FFNN classification head which achieved a close second place with a primary F1-score of 0.6319
and a secondary F1-score of 0.9393. They also experimented with CNNs followed by k-Nearest
Neighbor (k-NN) models and ensembles which performed slightly worse.
        </p>
        <p>
          DS@BioMed [
          <xref ref-type="bibr" rid="ref20">20</xref>
          ] The DS@BioMed team employed a Shifted Window Transformer v2 (Swin-v2) [47]
to achieve an F1-score of 0.6200 and a secondary F1-score of 0.9312. They also experimented
with other transformer-based architectures, as well as CNNs and ensembles.
        </p>
        <p>
          SSNMLRGKSR [
          <xref ref-type="bibr" rid="ref21">21</xref>
          ] The SSNMLRGKSR team used a DenseNet-121 [44] CNN for their best approach
which achieved a primary F1-score of 0.6001 and a secondary F1-score of 0.9056.
UACH-VisionLab [23] The UACH-VisionLab team used several EfficientNet-B0 [43] models trained
for different sub-groups of concepts to achieve a primary F1-score of 0.5988 and a secondary
F1-score of 0.9363.
        </p>
        <p>MICLabNM [24] The MICLabNM team employed a VisualT5 image-to-text encoder-decoder
architecture coupling a Vision Transformer (ViT) [48] with an encoder-decoder T5 [49] text transformer
achieving F1-scores of 0.5795 and 0.8835.</p>
        <p>Kaprov [25] The Kaprov team utilized a CNN-LSTM model, achieving a primary F1-score of 0.4609
and a secondary F1-score of 0.7301.
VIT_Conceptz [27] The VIT_Conceptz team used a ResNet50 [42] CNN to achieve F1-scores of 0.1812
and 0.2647.</p>
        <p>
          CS_Morgan [
          <xref ref-type="bibr" rid="ref22">22</xref>
          ] The CS_Morgan team experimented with a ConvMixer [50] model which consists of
a combination of CNN and Transformer architectures achieving F1-scores of 0.1076 and 0.2105.
        </p>
        <p>To summarize, in the concept detection subtasks, the groups used primarily multi-label classification
systems, with one team integrating image retrieval systems in some of their approaches. Most teams
used CNNs to extract features for images. Some teams explored Transformer-based [51] models, such
as ViTs [48], while one team used a ConvMixer [50] architecture, blending convolutional networks and
ViTs. The winning team this year utilized an ensemble of four different CNNs.</p>
        <p>Comparing this year’s concept detection task results to those of the last year’s ImageCLEFmedical
Caption, a remarkable increase of achieved F1-Scores can be observed. For a direct comparison, last
year’s winner and now second best AUEB-NLP-Group managed to increase their F1-Score from 0.5223
to 0.6319, close to team DBS-HHU’s winning F1-Score of 0.6375. This increase is much smaller for
the secondary F1-Score, where the AUEB-NLP-Group increased their score from 0.9258 to 0.9393, and
DBS-HHU achieved a new all-time high of 0.9534. By training and evaluating our own baseline model
on the data from this year, we could determine that about 0.1 of the difference in primary F1-score is
purely due to the new test dataset, which contains a much smaller number of unique concepts (see
Table 2). One difference in this year’s dataset compared to last year’s is that the newly added images
were fully used for the test dataset and not split into validation and test, resulting in a larger test dataset.
On the other hand, the number of unique concepts in the test dataset is much lower than last year,
indicating a difference in the newly added data. The practice of updating the test set with the latest
images from the PMC Open Access subset can lead to such complications. Further improvements
in primary and secondary F1-score can be attributed to continuous changes and improvements of
the challenge dataset, e.g., correction of previous errors and further refinement of quality assurance
measures as well as improvements and scaling of the teams’ approaches.</p>
      </sec>
      <sec id="sec-5-2">
        <title>5.2. Results for the Caption Prediction Subtask</title>
        <p>In this 8th edition, the caption prediction subtask attracted 11 teams which submitted 53 graded runs.
Tables 4, 5 and 6 present the results of the submissions.</p>
        <p>
          PCLmed [26] The winning team introduced Medical Vision-Language Foundation Models
(MedVLFM) with Vision Encoder Ensembling (VEE) for better representing the content of medical
images and Modality-Aware Adaptation (MAA) to take the inference between vision and text
modalities into account. An ensemble of a Explore the limits of Visual representation at scAle
(EVA)-ViT-g [
          <xref ref-type="bibr" rid="ref23">52</xref>
          ] model which was pre-trained on natural images and a BioMedCLIP [
          <xref ref-type="bibr" rid="ref24">53</xref>
          ] model
pre-trained on medical images was implemented for image encoding. Pangu- [
          <xref ref-type="bibr" rid="ref25">54</xref>
          ] has been
used as the Large Language Model (LLM) for text generation. The model reached a BERTScore of
0.6299 and a ROUGE score of 0.2726 and won the caption prediction task.
        </p>
        <p>
          CS_Morgan [
          <xref ref-type="bibr" rid="ref22">22</xref>
          ] The CS_Morgan team experimented with different Large Multimodal Models (LMMs)
like Large Language and Vision Assistant (LLaVA) [
          <xref ref-type="bibr" rid="ref26">55</xref>
          ], IDEFICS [
          <xref ref-type="bibr" rid="ref27">56</xref>
          ], and MoonDream28. The
results of these models are compared to conventional encoder-decoder models like VisionGPT2
        </p>
        <sec id="sec-5-2-1">
          <title>8https://huggingface.co/vikhyatk/moondream2 [last accessed: 2024-07-01]</title>
          <p>and CNN-Transformer architectures. The best-performing model of the team was a fine-tuned
LLaVA 1.6 Mistral 7B. This model achieved a BERTScore of 0.6281 and a ROUGE score of 0.2508.
DarkCow [30] The DarkCow team obtained a BERTScore of 0.6267 and a ROUGE score of 0.2452.</p>
          <p>
            A VinVL [
            <xref ref-type="bibr" rid="ref28">57</xref>
            ] model was used to extract object features from the images. These features were
combined with more general visual features extracted using a ViT [48] model. ClinicalT5- [
            <xref ref-type="bibr" rid="ref29">58</xref>
            ]
and Biomedical Bidirectional and Auto-Regressive Transformers (BioBART) [
            <xref ref-type="bibr" rid="ref30">59</xref>
            ]-based models
were used for the caption generation. The best results were achieved for the BioBART model.
AUEB-NLP-Group [
            <xref ref-type="bibr" rid="ref18">18</xref>
            ] The AUEB-NLP-Group’s approach on caption prediction involved four
primary systems: The first one employing a InstructBLIP [
            <xref ref-type="bibr" rid="ref31">60</xref>
            ] model, and the other ones building
up upon it, applying a synthesizer, a rephraser, and an innovative Distance from Median
Maximum Concept Similarity (DMMCS) mechanism. One combination of InstructBLIP with DMMCS
achieved the team’s best BERTscore of 0.6211 and a ROUGE score of 0.2049.
2Q2T [29] The 2Q2T team used the Bootstrapping Language-Image Pre-training (BLIP) [
            <xref ref-type="bibr" rid="ref32">61</xref>
            ]
architecture as their main approach, which combines a ViT [48] as the encoder while using BERT [34]
for text generation. They yielded a BERTScore of 0.6178 and ROUGE score of 0.2478 for caption
prediction.
          </p>
          <p>
            MICLabNM [24] The MICLabNM team used a model that combines a ViT [48] with ClinicalT5 [
            <xref ref-type="bibr" rid="ref29">58</xref>
            ],
called VisualT5. The approach also features a modified spatial attention module for interpretability,
by highlighting important image areas for model decisions. The approach achieved a 0.6129
BERTScore and a ROUGE score of 0.2135 for caption prediction.
          </p>
          <p>DLNU_CCSE The team’s approach achieved a BERTScore of 0.6066 and a ROUGE score of 0.2179,
with no working notes submitted by the team.</p>
          <p>
            Kaprov [25] The Kaprov team implemented a combination of a Visual Geometry Group (VGG)-16
[
            <xref ref-type="bibr" rid="ref33">62</xref>
            ]based CNN and a Long Short-Term Memory (LSTM) [
            <xref ref-type="bibr" rid="ref34">63</xref>
            ] model for the caption prediction task.
          </p>
          <p>
            The team achieved a BERTScore of 0.5964 and a ROUGE score of 0.1905 on the private test set.
DS@BioMed [
            <xref ref-type="bibr" rid="ref20">20</xref>
            ] The best performing-model which was submitted by the DS@BioMed team
implemented a combination of a BERT [34] Pre-Training of Image Transformers (BEiT) [
            <xref ref-type="bibr" rid="ref35">64</xref>
            ] and an
BioBART [
            <xref ref-type="bibr" rid="ref30">59</xref>
            ] model. This model incorporated the information which was extracted from the
medical images with the concepts extracted in the concept detection task. The team achieved a
BERTScore of 0.5794 and a ROUGE score of 0.1031 on the private test set.
          </p>
          <p>
            DBS-HHU [
            <xref ref-type="bibr" rid="ref19">19</xref>
            ] The DBS-HHU team based their caption prediction approach on simple pre-processing
(lowercasing, punctuation removal, numbers exchange with number token) to focus on linguistic
content. Two models, fine-tuned Generative Image-to-text Transformer (GIT) [
            <xref ref-type="bibr" rid="ref36">65</xref>
            ] -base and
GIT-large, were then employed for caption generation. Both models achieved nearly equal scores,
with the large model achieving the higher BERTscore of 0.5769 and a ROUGE score of 0.1531.
KDE-MED-CAPTION [28] The KDE-MED-CAPTION team implemented a caption retrieval approach.
          </p>
          <p>
            First, a priority-based partitioning was implemented. Afterwards, EfficientNet [43], ResNeXt [
            <xref ref-type="bibr" rid="ref37">66</xref>
            ],
and ViT [48] models were trained for concept detection. These models were used for feature
extraction. Similarity measures were used to compare the extracted features from the test samples
with the training samples. The caption of the most similar training sample is predicted for a test
sample. The best model submitted by the KDE-MED-CAPTION team reached a BERTScore of
0.5673 and a ROUGE score of 0.1325.
          </p>
          <p>
            To summarize, in the caption prediction subtask teams primarily utilized encoder-decoder
frameworks with various backbones, including transformer-based decoders and LSTMs [
            <xref ref-type="bibr" rid="ref34">63</xref>
            ]. ViTs [48] were
commonly employed for feature extraction. Some approaches integrated concept detection into the
caption generation process by providing predicted concepts as input to the encoder along with the
images. This year saw a notable increase in the use of LLMs such as BioBART [
            <xref ref-type="bibr" rid="ref30">59</xref>
            ] and ClinicalT5 [
            <xref ref-type="bibr" rid="ref29">58</xref>
            ]
and Vision Language Models (VLMs), including LLaVA [
            <xref ref-type="bibr" rid="ref26">55</xref>
            ] and IDEFICS [
            <xref ref-type="bibr" rid="ref27">56</xref>
            ], with some teams
experimenting with visual instruction tuning. Only one team used a retrieval-based approach for this
subtask. The winning team introduced medical vision-language foundation models (Med-VLFMs) by
combining general and specialist vision models to achieve top rankings in the challenge.
          </p>
          <p>This is the second iteration of the caption prediction subtask which used BERTScore and ROUGE
as primary and secondary evaluation metrics, after BLEU-1 had been used as the primary evaluation
metric in all previous iterations. While some teams were still mainly optimizing for the BLEU-1 score
last year, resulting in a wide spread of scores for the different metrics with some teams scoring very
strongly in some metrics and very weakly in others, the scores were much more even this year, with
the winning approach scoring strongly across all metrics.</p>
          <p>Even though last year’s winning team CSIRO achieved an all-time high BERTScore of 0.6425, a
notable overall increase is visible in returning teams’ scores. E.g., this year’s winning team PCLmed
increased their prior score from 0.6152 to 0.6299. The same applies for other teams CS_Morgan (0.5819
vs. 0.6281), the AUEB-NLP-Group (0.6170 vs. 0.6211), and team DLNU_CCSE (0.6005 vs. 0.6066). Such
notable increases are observable for the other scores ROUGE, BLEURT, CIDEr, METEOR, and CLIPScore
as well. The main reasons for the improvements are likely continuous improvements of the teams’
approaches, while experimentation with new approaches did not yield breakthrough improvements.
The newly introduced metrics ClinicalBLEURT and MedBERTScore grant additional insight.</p>
          <p>The new optional explainability extension was not adopted by the teams; only the team
MICLabNM [24] submitted explainability results after the end of the submission phase.</p>
        </sec>
      </sec>
    </sec>
    <sec id="sec-6">
      <title>6. Conclusion</title>
      <p>
        This year’s caption task of ImageCLEFmedical once again ran with both subtasks, concept detection
and caption prediction. It used the newly released ROCOv2 [
        <xref ref-type="bibr" rid="ref14">14</xref>
        ] as the development dataset. It attracted
14 teams who submitted 82 graded runs using for the first time the AI4MediaBench platform. For the
concept detection task, the F1-score and a secondary F1-score, considering only the manually curated
concepts, were used. After changing the primary evaluation metric for the caption prediction subtask
from BLEU to BERTScore for last year, additional, more domain-specific metrics were added for this
year, one of which may be used as the primary metric for next year. The caption prediction subtask was
again more popular than the concept detection subtask this year, with 6 teams participating in both
subtasks, 5 teams participating only in the caption prediction subtask, and 3 teams only participating
in the concept detection subtask. As before, the teams generally approached the tasks completely
separately, with only the DS@BioMed team using the generated concepts for the predicted captions.
      </p>
      <p>
        Like in the 2023 challenge [
        <xref ref-type="bibr" rid="ref10">10</xref>
        ], teams generally used multi-label classification systems for the concept
detection subtask, with the winning team using an ensemble of four CNNs. Only one team integrated
image retrieval systems in some of their approaches. For the caption prediction subtask,
encoder-decoder frameworks were used by most teams, with ViTs being used to extract features. LLMs were
increasingly being used to generate and fine-tune the captions. The winning approach used Med-VLFMs
by combining general and specialist vision models.
      </p>
      <p>For the concept detection subtask, the overall primary F1-scores increased strongly compared to
last year despite very similar approaches being employed by the teams. In addition to continuously
improved and scaled-up approaches by the teams, a large part of the improvement can be explained by
a lower number of unique concepts in the test set compared to last year.</p>
      <p>
        The same applies for the general view on results of this year’s caption prediction task. The top
scores were slightly worse for BERTScore, but last year’s winners CSIRO [
        <xref ref-type="bibr" rid="ref38">67</xref>
        ] did not participate this
year. Returning teams improved their scores across the board showing that the dataset for this year is
comparable to last year for the caption prediction and that while teams have experimented with many
different approaches including LLMs for caption generation, no breakthrough improvement has been
achieved with these new techniques.
      </p>
      <p>For next year’s ImageCLEFmedical Caption challenge, some possible improvements include an
improved caption prediction evaluation metric which is specific to medical texts, as well as additional
metrics for readability and factuality. A comprehensive analysis of different metrics is planned to
determine whether they should be used as primary indicators or whether a combination of different
metrics would be more appropriate for this task, given the complex nature of evaluating generated
captions.</p>
      <p>An additional focus will be explainability. The optional extension to the caption prediction subtask
where participants were asked to provide explainability results for a small subset of images was not
adopted by the participants, with only a single team submitting explainability results after the end of
the submission phase. For next year, examples will be provided for how these explainability results
could look and it might be extracted into its own subtask.</p>
    </sec>
    <sec id="sec-7">
      <title>Acknowledgments</title>
      <p>This work was partially supported by the University of Essex GCRF QR Engagement Fund provided
by Research England (grant number G026). The work of Louise Bloch, Benjamin Bracke and Raphael
Brüngel was partially funded by a PhD grant from the University of Applied Sciences and Arts Dortmund
(FH Dortmund), Germany. The work of Ahmad Idrissi-Yaghir, Henning Schäfer, Tabea M. G. Pakull and
Hendrik Damm was funded by a PhD grant from the DFG Research Training Group 2535
Knowledge- and data-based personalisation of medicine at the point of care (WisPerMed).
CEUR-WS.org, Grenoble, France, 2024.
[23] A. Moncloa-Muro, G. Ramirez-Alonso, F. Martinez-Reyes, Automatic medical concept detection
on images: dividing the task into smaller ones, in: CLEF2024 Working Notes, CEUR Workshop
Proceedings, CEUR-WS.org, Grenoble, France, 2024.
[24] D. Carmo, L. Rittner, R. Lotufo, VisualT5: Multitasking caption and concept prediction with
pre-trained ViT, T5 and customized spatial attention in radiological images, in: CLEF2024 Working
Notes, CEUR Workshop Proceedings, CEUR-WS.org, Grenoble, France, 2024.
[25] P. Balasundaram, K. Swaminathan, O. Sampath, P. KM, Concept detection and caption prediction
of radiology images using convolutional neural networks, in: CLEF2024 Working Notes, CEUR
Workshop Proceedings, CEUR-WS.org, Grenoble, France, 2024.
[26] B. Yang, Y. Yu, Y. Zou, T. Zhang, PCLmed: Champion solution for ImageCLEFmedical 2024 caption
prediction challenge via medical vision-language foundation models, in: CLEF2024 Working Notes,
CEUR Workshop Proceedings, CEUR-WS.org, Grenoble, France, 2024.
[27] S. Ram, S. Vinoth, R. N. Gopalakrishnan, A. A. Balakumar, L. Kalinathan, T. A. J. Velankanni,
Leveraging diverse CNN architectures for medical image captioning: DenseNet-121, MobileNetV2,
and ResNet-50 in ImageCLEF 2024, in: CLEF2024 Working Notes, CEUR Workshop Proceedings,
CEUR-WS.org, Grenoble, France, 2024.
[28] M. Aono, T. Asakawa, K. Shimizu, K. Nomura, Medical image captioning using CUI-based
classification and feature similarity, in: CLEF2024 Working Notes, CEUR Workshop Proceedings,
CEUR-WS.org, Grenoble, France, 2024.
[29] T. V. Phan, T. K. Nguyen, Q. A. Hoang, Q. T. Phan, T. B. Nguyen-Tat, MedBLIP: Multimodal
medical image captioning using BLIP, in: CLEF2024 Working Notes, CEUR Workshop Proceedings,
CEUR-WS.org, Grenoble, France, 2024.
[30] Q. V. Nguyen, Q. H. Pham, D. Q. Tran, T. K.-B. Nguyen, N.-H. Nguyen-Dang, B.-T. Nguyen-Tat,
UITDarkCow team at ImageCLEFmedical caption 2024: Diagnostic captioning for radiology images
efficiency with transformer models, in: CLEF2024 Working Notes, CEUR Workshop Proceedings,
CEUR-WS.org, Grenoble, France, 2024.
[31] Z. Kraljevic, T. Searle, A. Shek, L. Roguski, K. Noor, D. Bean, A. Mascio, L. Zhu, A. A. Folarin,
A. Roberts, R. Bendayan, M. P. Richardson, R. Stewart, A. D. Shah, W. K. Wong, Z. Ibrahim,
J. T. Teo, R. J. Dobson, Multi-domain clinical natural language processing with MedCAT: The
medical concept annotation toolkit, Artificial Intelligence in Medicine 117 (2021) 102083. URL:
https://www.sciencedirect.com/science/article/pii/S0933365721000762. doi:https://doi.org/
10.1016/j.artmed.2021.102083.
[32] A. E. Johnson, T. J. Pollard, L. Shen, L. wei H. Lehman, M. Feng, M. Ghassemi, B. Moody, P. Szolovits,
L. A. Celi, R. G. Mark, MIMIC-III, a freely accessible critical care database, Scientific Data 3 (2016).</p>
      <p>URL: https://doi.org/10.1038/sdata.2016.35. doi:10.1038/sdata.2016.35.
[33] T. M. Lehmann, H. Schubert, D. Keysers, M. Kohnen, B. B. Wein, The IRMA code for unique
classification of medical images, in: H. K. Huang, O. M. Ratib (Eds.), Medical Imaging 2003: PACS
and Integrated Medical Information Systems: Design and Evaluation, SPIE, 2003. doi:10.1117/
12.480677.
[34] J. Devlin, M.-W. Chang, K. Lee, K. Toutanova, BERT: Pre-training of deep bidirectional transformers
for language understanding, in: J. Burstein, C. Doran, T. Solorio (Eds.), Proceedings of the 2019
Conference of the North American Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 1 (Long and Short Papers), Association for Computational
Linguistics, Minneapolis, Minnesota, 2019, pp. 4171 – 4186. URL: https://aclanthology.org/N19-1423.
doi:10.18653/v1/N19-1423.
[35] C.-Y. Lin, ROUGE: A package for automatic evaluation of summaries, in: Text
Summarization Branches Out, Association for Computational Linguistics, 2004, pp. 74–81. URL: https:
//aclanthology.org/W04-1013.
[36] M. Denkowski, A. Lavie, Meteor universal: Language specific translation evaluation for any target
language, in: Proceedings of the Ninth Workshop on Statistical Machine Translation, Association
for Computational Linguistics, 2014, pp. 376–380. URL: http://aclweb.org/anthology/W14-3348.
doi:10.3115/v1/W14-3348.
[37] R. Vedantam, C. L. Zitnick, D. Parikh, CIDEr: Consensus-based image description evaluation, in:
2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), IEEE, 2015, pp. 4566–
4575. URL: http://ieeexplore.ieee.org/document/7299087/. doi:10.1109/CVPR.2015.7299087.
[38] T. Sellam, D. Das, A. Parikh, BLEURT: Learning robust metrics for text generation, in:
Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, Association
for Computational Linguistics, Online, 2020, pp. 7881–7892. URL: https://aclanthology.org/2020.
acl-main.704. doi:10.18653/v1/2020.acl-main.704.
[39] J. Hessel, A. Holtzman, M. Forbes, R. Le Bras, Y. Choi, CLIPScore: A reference-free evaluation
metric for image captioning, in: Proceedings of the 2021 Conference on Empirical Methods in
Natural Language Processing, Association for Computational Linguistics, Online and Punta Cana,
Dominican Republic, 2021, pp. 7514–7528. URL: https://aclanthology.org/2021.emnlp-main.595.
doi:10.18653/v1/2021.emnlp-main.595.
[40] A. Radford, J. W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, G. Sastry, A. Askell, P. Mishkin,
J. Clark, G. Krueger, I. Sutskever, Learning transferable visual models from natural language
supervision, in: M. Meila, T. Zhang (Eds.), Proceedings of the 38th International Conference
on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event, volume 139 of Proceedings of
Machine Learning Research, PMLR, 2021, pp. 8748–8763. URL: http://proceedings.mlr.press/v139/
radford21a.html.
[41] A. Ben Abacha, W.-w. Yim, G. Michalopoulos, T. Lin, An investigation of evaluation methods in
automatic medical note generation, in: A. Rogers, J. Boyd-Graber, N. Okazaki (Eds.), Findings of the
Association for Computational Linguistics: ACL 2023, Association for Computational Linguistics,
Toronto, Canada, 2023, pp. 2575–2588. URL: https://aclanthology.org/2023.findings-acl.161. doi: 10.
18653/v1/2023.findings-acl.161.
[42] K. He, X. Zhang, S. Ren, J. Sun, Deep residual learning for image recognition, in: Proceedings of
the IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2016), 2016, pp. 770 –
778. doi:10.1109/CVPR.2016.90.
[43] M. Tan, Q. V. Le, EfficientNet: Rethinking model scaling for convolutional neural networks, in:
Proceedings of the International Conference on Machine Learning (ICML 2019), 2019, pp. 6105 –
6114.
[44] G. Huang, Z. Liu, L. Van Der Maaten, K. Q. Weinberger, Densely connected convolutional networks,
in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2017),
2017, pp. 2261 – 2269. doi:10.1109/CVPR.2017.243.
[45] S. Zagoruyko, N. Komodakis, Wide residual networks, in: Proceedings of the British Machine</p>
      <p>Vision Conference (BMVC 2016), 2016. doi:10.5244/c.30.87.
[46] J. Deng, W. Dong, R. Socher, L.-J. Li, K. Li, L. Fei-Fei, ImageNet: A large-scale hierarchical image
database, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition
(CVPR 2009), 2009, pp. 248 – 255. doi:10.1109/CVPR.2009.5206848.
[47] Z. Liu, H. Hu, Y. Lin, Z. Yao, Z. Xie, Y. Wei, J. Ning, Y. Cao, Z. Zhang, L. Dong, F. Wei, B. Guo,
Swin Transformer V2: Scaling up capacity and resolution, in: Proceedings of the IEEE/CVF
Conference on Computer Vision and Pattern Recognition (CVPR 2022), 2022, pp. 11999 – 12009.
doi:10.1109/CVPR52688.2022.01170.
[48] A. Dosovitskiy, L. Beyer, A. I. Kolesnikov, D. Weissenborn, X. Zhai, T. Unterthiner, M. Dehghani,
M. Minderer, G. Heigold, S. Gelly, J. Uszkoreit, N. Houlsby, An image is worth 16x16 words:
Transformers for image recognition at scale, in: Proceedings of the International Conference on
Learning Representations (ICLR 2021), 2021.
[49] C. Raffel, N. Shazeer, A. Roberts, K. J. Lee, S. Narang, M. Matena, Y. Zhou, W. Li, P. Liu, Exploring
the limits of transfer learning with a unified text-to-text transformer, Journal of Machine Learning
Research 21 (2020) 1 – 67.
[50] A. Trockman, J. Z. Kolter, Patches are all you need?, Transactions on Machine Learning Research
(2023). URL: https://openreview.net/forum?id=rAnB7JSMXL.
[51] A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser, I. Polosukhin,</p>
    </sec>
    <sec id="sec-8">
      <title>A. Full Results</title>
    </sec>
  </body>
  <back>
    <ref-list>
      <ref id="ref1">
        <mixed-citation>
          [1]
          <string-name>
            <given-names>A.</given-names>
            <surname>García Seco de Herrera</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Schaer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Bromuri</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Müller</surname>
          </string-name>
          ,
          <article-title>Overview of the ImageCLEF 2016 medical task</article-title>
          ,
          <source>in: Working Notes of CLEF 2016 (Cross Language Evaluation Forum)</source>
          ,
          <year>2016</year>
          , pp.
          <fpage>219</fpage>
          -
          <lpage>232</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref2">
        <mixed-citation>
          [2]
          <string-name>
            <given-names>C.</given-names>
            <surname>Eickhoff</surname>
          </string-name>
          ,
          <string-name>
            <given-names>I. Schwall</given-names>
            ,
            <surname>A. G. S. de Herrera</surname>
          </string-name>
          , H. Müller,
          <article-title>Overview of ImageCLEFcaption 2017 - image caption prediction and concept detection for biomedical images</article-title>
          ,
          <source>in: Working Notes of CLEF 2017 - Conference and Labs of the Evaluation Forum</source>
          , Dublin, Ireland,
          <source>September 11-14</source>
          ,
          <year>2017</year>
          .,
          <year>2017</year>
          . URL: http://ceur-ws.
          <source>org/</source>
          Vol-1866/invited_paper_7.pdf.
        </mixed-citation>
      </ref>
      <ref id="ref3">
        <mixed-citation>
          [3]
          <string-name>
            <surname>A. G. S. de Herrera</surname>
            ,
            <given-names>C.</given-names>
          </string-name>
          <string-name>
            <surname>Eickhoff</surname>
            ,
            <given-names>V.</given-names>
          </string-name>
          <string-name>
            <surname>Andrearczyk</surname>
            ,
            <given-names>H.</given-names>
          </string-name>
          <string-name>
            <surname>Müller</surname>
          </string-name>
          ,
          <article-title>Overview of the ImageCLEF 2018 caption prediction tasks</article-title>
          ,
          <source>in: Working Notes of CLEF 2018 - Conference and Labs of the Evaluation Forum</source>
          , Avignon, France,
          <source>September 10-14</source>
          ,
          <year>2018</year>
          .,
          <year>2018</year>
          . URL: http://ceur-ws.
          <source>org/</source>
          Vol-
          <volume>2125</volume>
          /invited_ paper_4.pdf.
        </mixed-citation>
      </ref>
      <ref id="ref4">
        <mixed-citation>
          [4]
          <string-name>
            <given-names>O.</given-names>
            <surname>Pelka</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C. M.</given-names>
            <surname>Friedrich</surname>
          </string-name>
          ,
          <string-name>
            <surname>A. G. S. de Herrera</surname>
          </string-name>
          , H. Müller,
          <article-title>Overview of the ImageCLEFmed 2019 concept detection task</article-title>
          , in: L.
          <string-name>
            <surname>Cappellato</surname>
            ,
            <given-names>N.</given-names>
          </string-name>
          <string-name>
            <surname>Ferro</surname>
            ,
            <given-names>D. E.</given-names>
          </string-name>
          <string-name>
            <surname>Losada</surname>
          </string-name>
          , H. Müller (Eds.),
          <source>Working Notes of CLEF 2019 - Conference and Labs of the Evaluation Forum, Lugano, Switzerland, September</source>
          <volume>9</volume>
          -
          <issue>12</issue>
          ,
          <year>2019</year>
          , volume
          <volume>2380</volume>
          <source>of CEUR Workshop Proceedings, CEUR-WS.org</source>
          ,
          <year>2019</year>
          . URL: http://ceur-ws.
          <source>org/</source>
          Vol-
          <volume>2380</volume>
          /paper_245.pdf.
        </mixed-citation>
      </ref>
      <ref id="ref5">
        <mixed-citation>
          [5]
          <string-name>
            <given-names>O.</given-names>
            <surname>Pelka</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C. M.</given-names>
            <surname>Friedrich</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>García Seco de Herrera</surname>
          </string-name>
          , H. Müller,
          <article-title>Overview of the ImageCLEFmed 2020 concept prediction task: Medical image understanding</article-title>
          ,
          <source>in: CLEF2020 Working Notes</source>
          , volume
          <volume>1166</volume>
          <source>of CEUR Workshop Proceedings</source>
          , CEUR-WS.org, Thessaloniki, Greece,
          <year>2020</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref6">
        <mixed-citation>
          [6]
          <string-name>
            <given-names>O.</given-names>
            <surname>Bodenreider</surname>
          </string-name>
          ,
          <article-title>The Unified Medical Language System (UMLS): integrating biomedical terminology</article-title>
          ,
          <source>Nucleic Acids Research</source>
          <volume>32</volume>
          (
          <year>2004</year>
          )
          <fpage>267</fpage>
          -
          <lpage>270</lpage>
          . doi:
          <volume>10</volume>
          .1093/nar/gkh061.
        </mixed-citation>
      </ref>
      <ref id="ref7">
        <mixed-citation>
          [7]
          <string-name>
            <given-names>O.</given-names>
            <surname>Pelka</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A. Ben</given-names>
            <surname>Abacha</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>García Seco de Herrera</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Jacutprakart</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C. M.</given-names>
            <surname>Friedrich</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Müller</surname>
          </string-name>
          ,
          <article-title>Overview of the ImageCLEFmed 2021 concept &amp; caption prediction task</article-title>
          ,
          <source>in: CLEF2021 Working Notes, CEUR Workshop Proceedings</source>
          , CEUR-WS.org, Bucharest, Romania,
          <year>2021</year>
          , pp.
          <fpage>1101</fpage>
          -
          <lpage>1112</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref8">
        <mixed-citation>
          [8]
          <string-name>
            <given-names>J.</given-names>
            <surname>Rückert</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A. Ben</given-names>
            <surname>Abacha</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>García Seco de Herrera</surname>
          </string-name>
          , L. Bloch,
          <string-name>
            <given-names>R.</given-names>
            <surname>Brüngel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Idrissi-Yaghir</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Schäfer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Müller</surname>
          </string-name>
          ,
          <string-name>
            <surname>C.</surname>
          </string-name>
          <article-title>M. Friedrich, Overview of ImageCLEFmedical 2022 - caption prediction and concept detection</article-title>
          ,
          <source>in: CLEF2022 Working Notes, CEUR Workshop Proceedings</source>
          , CEUR-WS.org, Bologna, Italy,
          <year>2022</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref9">
        <mixed-citation>
          [9]
          <string-name>
            <given-names>O.</given-names>
            <surname>Pelka</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Koitka</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Rückert</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Nensa</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C. M.</given-names>
            <surname>Friedrich</surname>
          </string-name>
          ,
          <article-title>Radiology Objects in COntext (ROCO): a multimodal image dataset, in: Intravascular Imaging and Computer Assisted Stenting -</article-title>
          and
          <string-name>
            <surname>-</surname>
          </string-name>
          LargeScale
          <source>Annotation of Biomedical Data and Expert Label Synthesis - 7th Joint International Workshop</source>
          , CVII-STENT 2018 and Third International Workshop, LABELS 2018,
          <article-title>Held in Conjunction with MICCAI 2018, Granada</article-title>
          , Spain,
          <year>September 16</year>
          ,
          <year>2018</year>
          , Proceedings,
          <year>2018</year>
          , pp.
          <fpage>180</fpage>
          -
          <lpage>189</lpage>
          . doi:
          <volume>10</volume>
          .1007/ 978-3-
          <fpage>030</fpage>
          -01364-6\_
          <fpage>20</fpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref10">
        <mixed-citation>
          [10]
          <string-name>
            <given-names>J.</given-names>
            <surname>Rückert</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A. Ben</given-names>
            <surname>Abacha</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A. G.</given-names>
            <surname>Seco de Herrera</surname>
          </string-name>
          , L. Bloch,
          <string-name>
            <given-names>R.</given-names>
            <surname>Brüngel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Idrissi-Yaghir</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Schäfer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Müller</surname>
          </string-name>
          ,
          <string-name>
            <surname>C.</surname>
          </string-name>
          <article-title>M. Friedrich, Overview of ImageCLEFmedical 2023 - caption prediction and concept detection</article-title>
          ,
          <source>in: CLEF2023 Working Notes</source>
          , volume
          <volume>3497</volume>
          <source>of CEUR Workshop Proceedings</source>
          , CEUR-WS.org, Thessaloniki, Greece,
          <year>2023</year>
          , pp.
          <fpage>1328</fpage>
          -
          <lpage>1346</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref11">
        <mixed-citation>
          [11]
          <string-name>
            <given-names>K.</given-names>
            <surname>Papineni</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Roukos</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Ward</surname>
          </string-name>
          , W.-J. Zhu,
          <article-title>BLEU: a method for automatic evaluation of machine translation</article-title>
          ,
          <source>in: Proceedings of the 40th annual meeting of the Association for Computational Linguistics</source>
          ,
          <year>2002</year>
          , pp.
          <fpage>311</fpage>
          -
          <lpage>318</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref12">
        <mixed-citation>
          [12]
          <string-name>
            <given-names>T.</given-names>
            <surname>Zhang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Kishore</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Wu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K. Q.</given-names>
            <surname>Weinberger</surname>
          </string-name>
          ,
          <string-name>
            <surname>Y. Artzi,</surname>
          </string-name>
          <article-title>BERTScore: Evaluating text generation with BERT</article-title>
          ,
          <source>in: 8th International Conference on Learning Representations, ICLR</source>
          <year>2020</year>
          ,
          <string-name>
            <given-names>Addis</given-names>
            <surname>Ababa</surname>
          </string-name>
          , Ethiopia,
          <source>April 26-30</source>
          ,
          <year>2020</year>
          ,
          <year>2020</year>
          . URL: https://openreview.net/forum?id=SkeHuCVFDr.
        </mixed-citation>
      </ref>
      <ref id="ref13">
        <mixed-citation>
          [13]
          <string-name>
            <given-names>B.</given-names>
            <surname>Ionescu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Müller</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Drăgulinescu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Rückert</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A. Ben</given-names>
            <surname>Abacha</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>García Seco de Herrera</surname>
          </string-name>
          , L. Bloch,
          <string-name>
            <given-names>R.</given-names>
            <surname>Brüngel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Idrissi-Yaghir</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Schäfer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C. S.</given-names>
            <surname>Schmidt</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T. M. G.</given-names>
            <surname>Pakull</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Damm</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Bracke</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C. M.</given-names>
            <surname>Friedrich</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Andrei</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Prokopchuk</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Karpenka</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Radzhabov</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Kovalev</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Macaire</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Schwab</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Lecouteux</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Esperança-Rodier</surname>
          </string-name>
          ,
          <string-name>
            <given-names>W.</given-names>
            <surname>Yim</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Fu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Sun</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Yetisgen</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Xia</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S. A.</given-names>
            <surname>Hicks</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M. A.</given-names>
            <surname>Riegler</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Thambawita</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Storås</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Halvorsen</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Heinrich</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Kiesel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Potthast</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Stein</surname>
          </string-name>
          , Overview of ImageCLEF 2024:
          <article-title>Multimedia retrieval in medical applications, in: Experimental IR Meets Multilinguality</article-title>
          , Multimodality, and
          <string-name>
            <surname>Interaction</surname>
          </string-name>
          ,
          <source>Proceedings of the 15th International Conference of the CLEF Association (CLEF</source>
          <year>2024</year>
          ), Springer Lecture Notes in Computer Science LNCS, Grenoble, France,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref14">
        <mixed-citation>
          [14]
          <string-name>
            <given-names>J.</given-names>
            <surname>Rückert</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Bloch</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Brüngel</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Idrissi-Yaghir</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Schäfer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C. S.</given-names>
            <surname>Schmidt</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Koitka</surname>
          </string-name>
          ,
          <string-name>
            <given-names>O.</given-names>
            <surname>Pelka</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A. B.</given-names>
            <surname>Abacha</surname>
          </string-name>
          ,
          <string-name>
            <surname>A. G. S. de Herrera</surname>
            , H. Müller,
            <given-names>P. A.</given-names>
          </string-name>
          <string-name>
            <surname>Horn</surname>
            ,
            <given-names>F.</given-names>
          </string-name>
          <string-name>
            <surname>Nensa</surname>
            ,
            <given-names>C.</given-names>
          </string-name>
          <article-title>M. Friedrich, ROCOv2: Radiology Objects in COntext version 2, an updated multimodal image dataset, Scientific Data (</article-title>
          <year>2024</year>
          ). URL: https://arxiv.org/abs/2405.10004v1.
          <source>doi:10.1038/s41597-024-03496-6.</source>
        </mixed-citation>
      </ref>
      <ref id="ref15">
        <mixed-citation>
          [15]
          <string-name>
            <given-names>R. J.</given-names>
            <surname>Roberts</surname>
          </string-name>
          , PubMed Central:
          <article-title>The GenBank of the published literature</article-title>
          ,
          <source>Proceedings of the National Academy of Sciences of the United States of America</source>
          <volume>98</volume>
          (
          <year>2001</year>
          )
          <fpage>381</fpage>
          -
          <lpage>382</lpage>
          . doi:
          <volume>10</volume>
          .1073/ pnas.98.2.381.
        </mixed-citation>
      </ref>
      <ref id="ref16">
        <mixed-citation>
          [16]
          <string-name>
            <given-names>L. S.</given-names>
            <surname>Shapley</surname>
          </string-name>
          , et al.,
          <article-title>A value for n-person games (</article-title>
          <year>1953</year>
          ).
        </mixed-citation>
      </ref>
      <ref id="ref17">
        <mixed-citation>
          [17]
          <string-name>
            <given-names>S. M.</given-names>
            <surname>Lundberg</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.-I.</given-names>
            <surname>Lee</surname>
          </string-name>
          ,
          <article-title>A unified approach to interpreting model predictions</article-title>
          ,
          <source>in: Neural Information Processing Systems</source>
          , volume
          <volume>30</volume>
          ,
          <year>2017</year>
          , pp.
          <fpage>4768</fpage>
          -
          <lpage>4777</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref18">
        <mixed-citation>
          [18]
          <string-name>
            <given-names>M.</given-names>
            <surname>Samprovalaki</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Chatzipapadopoulou</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Moschovis</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Charalampakos</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Kaliosis</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Pavlopoulos</surname>
          </string-name>
          ,
          <string-name>
            <surname>I. Androutsopoulos</surname>
          </string-name>
          , AUEB NLP group at
          <source>ImageCLEFmedical</source>
          <year>2024</year>
          , in: CLEF2024 Working Notes, CEUR Workshop Proceedings, CEUR-WS.org, Grenoble, France,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref19">
        <mixed-citation>
          [19]
          <string-name>
            <given-names>H.</given-names>
            <surname>Kauschke</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Bogomasov</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Conrad</surname>
          </string-name>
          ,
          <article-title>Predicting captions and detecting concepts for medical images: Contributions of the DBS-HHU team to ImageCLEFmedical caption 2024</article-title>
          , in: CLEF2024 Working Notes, CEUR Workshop Proceedings, CEUR-WS.org, Grenoble, France,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref20">
        <mixed-citation>
          [20]
          <string-name>
            <given-names>N. N.</given-names>
            <surname>Nguyen</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H. L.</given-names>
            <surname>Tu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P. D.</given-names>
            <surname>Nguyen</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T. N.</given-names>
            <surname>Do</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T. M.</given-names>
            <surname>Thai</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T. B.</given-names>
            <surname>Nguyen-Tat</surname>
          </string-name>
          , DS@BioMed at ImageCLEFmedical caption 2024:
          <article-title>Enhanced attention mechanisms in medical caption generation through concept detection integration</article-title>
          ,
          <source>in: CLEF2024 Working Notes, CEUR Workshop Proceedings</source>
          , CEUR-WS.org, Grenoble, France,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref21">
        <mixed-citation>
          [21]
          <string-name>
            <given-names>R.</given-names>
            <surname>Dhinagaran</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S. S. N.</given-names>
            <surname>Mohamed</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Srinivasan</surname>
          </string-name>
          , SSNMLRGKSR at ImageCLEFmedical caption 2024:
          <article-title>Medical concept detection using DenseNet-121 with MultiLabelBinarizer</article-title>
          , in: CLEF2024 Working Notes, CEUR Workshop Proceedings, CEUR-WS.org, Grenoble, France,
          <year>2024</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref22">
        <mixed-citation>
          [22]
          <string-name>
            <given-names>M.</given-names>
            <surname>Hoque</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M. R.</given-names>
            <surname>Hasan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M. I. S.</given-names>
            <surname>Emon</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Khalifa</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M. M.</given-names>
            <surname>Rahman</surname>
          </string-name>
          ,
          <article-title>Medical image interpretation with large multimodal models</article-title>
          ,
          <source>in: CLEF2024 Working Notes, CEUR Workshop Proceedings</source>
          ,
          <article-title>Attention is all you need</article-title>
          , in: I. Guyon, U. von Luxburg, S. Bengio,
          <string-name>
            <given-names>H. M.</given-names>
            <surname>Wallach</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Fergus</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S. V. N.</given-names>
            <surname>Vishwanathan</surname>
          </string-name>
          , R. Garnett (Eds.),
          <source>Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9</source>
          ,
          <year>2017</year>
          , Long Beach, CA, USA,
          <year>2017</year>
          , pp.
          <fpage>5998</fpage>
          -
          <lpage>6008</lpage>
          . URL: https://proceedings.neurips.cc/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html.
        </mixed-citation>
      </ref>
      <ref id="ref23">
        <mixed-citation>
          [52]
          <string-name>
            <given-names>Y.</given-names>
            <surname>Fang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>W.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Xie</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Q.</given-names>
            <surname>Sun</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Wu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Huang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Cao</surname>
          </string-name>
          ,
          <article-title>EVA: Exploring the limits of masked visual representation learning at scale</article-title>
          ,
          <source>in: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR</source>
          <year>2023</year>
          ),
          <year>2023</year>
          , pp.
          <fpage>19358</fpage>
          -
          <lpage>19369</lpage>
          . doi:10.1109/CVPR52729.2023.01855.
        </mixed-citation>
      </ref>
      <ref id="ref24">
        <mixed-citation>
          [53]
          <string-name>
            <given-names>S.</given-names>
            <surname>Zhang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Xu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Usuyama</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Xu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Bagga</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Tinn</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Preston</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Rao</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Wei</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Valluri</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Wong</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Tupini</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mazzola</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Shukla</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Liden</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Gao</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M. P.</given-names>
            <surname>Lungren</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Naumann</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Poon</surname>
          </string-name>
          ,
          <article-title>BiomedCLIP: a multimodal biomedical foundation model pretrained from fifteen million scientific image-text pairs</article-title>
          ,
          <year>2024</year>
          . arXiv:2303.00915v2.
        </mixed-citation>
      </ref>
      <ref id="ref25">
        <mixed-citation>
          [54]
          <string-name>
            <given-names>W.</given-names>
            <surname>Zeng</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Ren</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Su</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Liao</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Jiang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Yang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Zhang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Li</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Gong</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Yao</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Huang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Yu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Q.</given-names>
            <surname>Guo</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Yu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Zhang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Tao</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Yan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Yi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Peng</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Jiang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Zhang</surname>
          </string-name>
          , L. Deng,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Zhang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Lin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Zhang</surname>
          </string-name>
          , S. Zhang,
          <string-name>
            <given-names>M.</given-names>
            <surname>Guo</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Gu</surname>
          </string-name>
          , G. Fan,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Jin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Q.</given-names>
            <surname>Liu</surname>
          </string-name>
          , Y. Tian, PanGu-α:
          <article-title>Large-scale autoregressive pretrained chinese language models with auto-parallel computation</article-title>
          ,
          <year>2021</year>
          . arXiv:2104.12369v1.
        </mixed-citation>
      </ref>
      <ref id="ref26">
        <mixed-citation>
          [55]
          <string-name>
            <given-names>H.</given-names>
            <surname>Liu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Li</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Q.</given-names>
            <surname>Wu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y. J.</given-names>
            <surname>Lee</surname>
          </string-name>
          ,
          <article-title>Visual instruction tuning</article-title>
          ,
          <source>in: Thirty-seventh Conference on Neural Information Processing Systems</source>
          ,
          <year>2023</year>
          . URL: https://openreview.net/forum?id=w0H2xGHlkw.
        </mixed-citation>
      </ref>
      <ref id="ref27">
        <mixed-citation>
          [56]
          <string-name>
            <given-names>H.</given-names>
            <surname>Laurençon</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Saulnier</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Tronchon</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Bekman</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Singh</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Lozhkov</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Karamcheti</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Rush</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Kiela</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Cord</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Sanh</surname>
          </string-name>
          ,
          OBELICS:
          <article-title>An open web-scale filtered dataset of interleaved image-text documents</article-title>
          , in: A.
          <string-name>
            <surname>Oh</surname>
            ,
            <given-names>T.</given-names>
          </string-name>
          <string-name>
            <surname>Naumann</surname>
            ,
            <given-names>A.</given-names>
          </string-name>
          <string-name>
            <surname>Globerson</surname>
            ,
            <given-names>K.</given-names>
          </string-name>
          <string-name>
            <surname>Saenko</surname>
            ,
            <given-names>M.</given-names>
          </string-name>
          <string-name>
            <surname>Hardt</surname>
          </string-name>
          , S. Levine (Eds.),
          <source>Advances in Neural Information Processing Systems</source>
          , volume
          <volume>36</volume>
          ,
          <publisher-name>Curran Associates, Inc.</publisher-name>
          ,
          <year>2023</year>
          , pp.
          <fpage>71683</fpage>
          -
          <lpage>71702</lpage>
          . URL: https://proceedings.neurips.cc/paper_files/paper/2023/file/e2cfb719f58585f779d0a4f9f07bd618-Paper-Datasets_and_Benchmarks.pdf.
        </mixed-citation>
      </ref>
      <ref id="ref28">
        <mixed-citation>
          [57]
          <string-name>
            <given-names>P.</given-names>
            <surname>Zhang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Li</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Hu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Yang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Zhang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Choi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Gao</surname>
          </string-name>
          ,
          <article-title>VinVL: Revisiting visual representations in vision-language models</article-title>
          ,
          <source>in: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR</source>
          <year>2021</year>
          ),
          <year>2021</year>
          , pp.
          <fpage>5575</fpage>
          -
          <lpage>5584</lpage>
          . doi:10.1109/CVPR46437.2021.00553.
        </mixed-citation>
      </ref>
      <ref id="ref29">
        <mixed-citation>
          [58]
          <string-name>
            <given-names>Q.</given-names>
            <surname>Lu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Dou</surname>
          </string-name>
          , T. Nguyen,
          <article-title>ClinicalT5: A generative language model for clinical text</article-title>
          , in: Y.
          <string-name>
            <surname>Goldberg</surname>
            ,
            <given-names>Z.</given-names>
          </string-name>
          <string-name>
            <surname>Kozareva</surname>
          </string-name>
          , Y. Zhang (Eds.),
          <source>Findings of the Association for Computational Linguistics: EMNLP</source>
          <year>2022</year>
          ,
          <publisher-name>Association for Computational Linguistics</publisher-name>
          , Abu Dhabi, United Arab Emirates,
          <year>2022</year>
          , pp.
          <fpage>5436</fpage>
          -
          <lpage>5443</lpage>
          . doi:10.18653/v1/2022.findings-emnlp.398.
        </mixed-citation>
      </ref>
      <ref id="ref30">
        <mixed-citation>
          [59]
          <string-name>
            <given-names>H.</given-names>
            <surname>Yuan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Yuan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Gan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Zhang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Y.</given-names>
            <surname>Xie</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Yu</surname>
          </string-name>
          ,
          <article-title>BioBART: Pretraining and evaluation of a biomedical generative language model</article-title>
          , in: D.
          <string-name>
            <surname>Demner-Fushman</surname>
            ,
            <given-names>K. B.</given-names>
          </string-name>
          <string-name>
            <surname>Cohen</surname>
            ,
            <given-names>S.</given-names>
          </string-name>
          <string-name>
            <surname>Ananiadou</surname>
          </string-name>
          , J. Tsujii (Eds.),
          <source>Proceedings of the 21st Workshop on Biomedical Language Processing (BioNLP</source>
          <year>2022</year>
          ),
          <publisher-name>Association for Computational Linguistics</publisher-name>
          , Dublin, Ireland,
          <year>2022</year>
          , pp.
          <fpage>97</fpage>
          -
          <lpage>109</lpage>
          . doi:10.18653/v1/2022.bionlp-1.9.
        </mixed-citation>
      </ref>
      <ref id="ref31">
        <mixed-citation>
          [60]
          <string-name>
            <given-names>W.</given-names>
            <surname>Dai</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Li</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Li</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Tiong</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Zhao</surname>
          </string-name>
          ,
          <string-name>
            <given-names>W.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Li</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P. N.</given-names>
            <surname>Fung</surname>
          </string-name>
          , S. Hoi, InstructBLIP:
          <article-title>Towards general-purpose vision-language models with instruction tuning</article-title>
          , in: A.
          <string-name>
            <surname>Oh</surname>
            ,
            <given-names>T.</given-names>
          </string-name>
          <string-name>
            <surname>Naumann</surname>
            ,
            <given-names>A.</given-names>
          </string-name>
          <string-name>
            <surname>Globerson</surname>
            ,
            <given-names>K.</given-names>
          </string-name>
          <string-name>
            <surname>Saenko</surname>
            ,
            <given-names>M.</given-names>
          </string-name>
          <string-name>
            <surname>Hardt</surname>
          </string-name>
          , S. Levine (Eds.),
          <source>Advances in Neural Information Processing Systems</source>
          , volume
          <volume>36</volume>
          ,
          <publisher-name>Curran Associates, Inc.</publisher-name>
          ,
          <year>2023</year>
          , pp.
          <fpage>49250</fpage>
          -
          <lpage>49267</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref32">
        <mixed-citation>
          [61]
          <string-name>
            <given-names>J.</given-names>
            <surname>Li</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Li</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Xiong</surname>
          </string-name>
          , S. Hoi, BLIP:
          <article-title>Bootstrapping language-image pre-training for unified vision-language understanding and generation</article-title>
          ,
          <source>in: International Conference on Machine Learning</source>
          ,
          <year>2022</year>
          , pp.
          <fpage>12888</fpage>
          -
          <lpage>12900</lpage>
          .
        </mixed-citation>
      </ref>
      <ref id="ref33">
        <mixed-citation>
          [62]
          <string-name>
            <given-names>K.</given-names>
            <surname>Simonyan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Zisserman</surname>
          </string-name>
          ,
          <article-title>Very deep convolutional networks for large-scale image recognition</article-title>
          ,
          <source>Proceedings of the International Conference on Learning Representations (ICLR</source>
          <year>2014</year>
          ) (
          <year>2014</year>
          ).
        </mixed-citation>
      </ref>
      <ref id="ref34">
        <mixed-citation>
          [63]
          <string-name>
            <given-names>S.</given-names>
            <surname>Hochreiter</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Schmidhuber</surname>
          </string-name>
          ,
          <article-title>Long short-term memory</article-title>
          ,
          <source>Neural Comput. 9</source>
          (
          <year>1997</year>
          )
          <fpage>1735</fpage>
          -
          <lpage>1780</lpage>
          . URL: https://doi.org/10.1162/neco.1997.9.8.1735. doi:10.1162/neco.1997.9.8.1735.
        </mixed-citation>
      </ref>
      <ref id="ref35">
        <mixed-citation>
          [64]
          <string-name>
            <given-names>H.</given-names>
            <surname>Bao</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Dong</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Piao</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Wei</surname>
          </string-name>
          ,
          <article-title>BEit: BERT pre-training of image transformers</article-title>
          ,
          <source>in: Proceedings of the International Conference on Learning Representations (ICLR</source>
          <year>2022</year>
          ),
          <year>2022</year>
          . URL: https://openreview.net/forum?id=p-BhZSz59o4.
        </mixed-citation>
      </ref>
      <ref id="ref36">
        <mixed-citation>
          [65]
          <string-name>
            <given-names>J.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Yang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Hu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Li</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Lin</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Gan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Liu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>C.</given-names>
            <surname>Liu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.</given-names>
            <surname>Wang</surname>
          </string-name>
          ,
          GIT:
          <article-title>A generative image-to-text transformer for vision and language</article-title>
          ,
          <source>Transactions on Machine Learning Research</source>
          <year>2022</year>
          (
          <year>2022</year>
          ).
        </mixed-citation>
      </ref>
      <ref id="ref37">
        <mixed-citation>
          [66]
          <string-name>
            <given-names>S.</given-names>
            <surname>Xie</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Girshick</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Dollár</surname>
          </string-name>
          ,
          <string-name>
            <given-names>Z.</given-names>
            <surname>Tu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>He</surname>
          </string-name>
          ,
          <article-title>Aggregated residual transformations for deep neural networks</article-title>
          ,
          <source>in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR</source>
          <year>2017</year>
          ),
          <year>2017</year>
          , pp.
          <fpage>5987</fpage>
          -
          <lpage>5995</lpage>
          . doi:10.1109/CVPR.2017.634.
        </mixed-citation>
      </ref>
      <ref id="ref38">
        <mixed-citation>
          [67]
          <string-name>
            <given-names>A.</given-names>
            <surname>Nicolson</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Dowling</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Koopman</surname>
          </string-name>
          ,
          <article-title>A concise model for medical image captioning</article-title>
          ,
          <source>in: CLEF2023 Working Notes, CEUR Workshop Proceedings</source>
          , CEUR-WS.org, Thessaloniki, Greece,
          <year>2023</year>
          .
        </mixed-citation>
      </ref>
    </ref-list>
  </back>
</article>