<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">AUEB NLP Group at ImageCLEFmedical Caption 2024</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Marina</forename><surname>Samprovalaki</surname></persName>
							<email>samprovalaki@aueb.gr</email>
							<affiliation key="aff0">
								<orgName type="department">Department of Informatics</orgName>
								<orgName type="institution">Athens University of Economics and Business</orgName>
								<address>
									<addrLine>76, Patission Street</addrLine>
									<postCode>GR-104 34</postCode>
									<settlement>Athens</settlement>
									<country key="GR">Greece</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Anna</forename><surname>Chatzipapadopoulou</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Department of Informatics</orgName>
								<orgName type="institution">Athens University of Economics and Business</orgName>
								<address>
									<addrLine>76, Patission Street</addrLine>
									<postCode>GR-104 34</postCode>
									<settlement>Athens</settlement>
									<country key="GR">Greece</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Georgios</forename><surname>Moschovis</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Department of Informatics</orgName>
								<orgName type="institution">Athens University of Economics and Business</orgName>
								<address>
									<addrLine>76, Patission Street</addrLine>
									<postCode>GR-104 34</postCode>
									<settlement>Athens</settlement>
									<country key="GR">Greece</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="department">Archimedes Unit</orgName>
								<orgName type="institution">Athena Research Center</orgName>
								<address>
									<addrLine>1, Artemidos Street</addrLine>
									<postCode>GR-151 25</postCode>
									<settlement>Athens</settlement>
									<country key="GR">Greece</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Foivos</forename><surname>Charalampakos</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Department of Informatics</orgName>
								<orgName type="institution">Athens University of Economics and Business</orgName>
								<address>
									<addrLine>76, Patission Street</addrLine>
									<postCode>GR-104 34</postCode>
									<settlement>Athens</settlement>
									<country key="GR">Greece</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Panagiotis</forename><surname>Kaliosis</surname></persName>
							<email>pkaliosis@aueb.gr</email>
							<affiliation key="aff0">
								<orgName type="department">Department of Informatics</orgName>
								<orgName type="institution">Athens University of Economics and Business</orgName>
								<address>
									<addrLine>76, Patission Street</addrLine>
									<postCode>GR-104 34</postCode>
									<settlement>Athens</settlement>
									<country key="GR">Greece</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">John</forename><surname>Pavlopoulos</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Department of Informatics</orgName>
								<orgName type="institution">Athens University of Economics and Business</orgName>
								<address>
									<addrLine>76, Patission Street</addrLine>
									<postCode>GR-104 34</postCode>
									<settlement>Athens</settlement>
									<country key="GR">Greece</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="department">Archimedes Unit</orgName>
								<orgName type="institution">Athena Research Center</orgName>
								<address>
									<addrLine>1, Artemidos Street</addrLine>
									<postCode>GR-151 25</postCode>
									<settlement>Athens</settlement>
									<country key="GR">Greece</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Ion</forename><surname>Androutsopoulos</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Department of Informatics</orgName>
								<orgName type="institution">Athens University of Economics and Business</orgName>
								<address>
									<addrLine>76, Patission Street</addrLine>
									<postCode>GR-104 34</postCode>
									<settlement>Athens</settlement>
									<country key="GR">Greece</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="department">Archimedes Unit</orgName>
								<orgName type="institution">Athena Research Center</orgName>
								<address>
									<addrLine>1, Artemidos Street</addrLine>
									<postCode>GR-151 25</postCode>
									<settlement>Athens</settlement>
									<country key="GR">Greece</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">AUEB NLP Group at ImageCLEFmedical Caption 2024</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">0C71B9A24230E159FE03D231073B8B48</idno>
					<idno type="arXiv">arXiv:2312.10997</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T18:03+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Natural Language Processing</term>
					<term>Computer Vision</term>
					<term>Biomedical Images</term>
					<term>Convolutional Neural Networks</term>
					<term>Multi-Label Classification</term>
					<term>Caption Generation</term>
					<term>Generative Models</term>
					<term>Transformers</term>
					<term>Deep Learning</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>This article describes the approaches that the AUEB NLP Group experimented with during its participation in the 8th edition of the ImageCLEFmedical Caption evaluation campaign, including both Concept Detection and Caption Prediction tasks. The objective of Concept Detection is to automatically categorize biomedical images into a set of one or more concepts. In contrast, the Caption Prediction task focuses on generating a precise and meaningful diagnostic caption that describes the medical conditions depicted in the image. Building on our prior research for the Concept Detection task, we utilized a diverse set of Convolutional Neural Network (CNN) encoders, followed by a Feed-Forward Neural Network. Additionally, we implemented two versions of the retrieval-based 𝑘-NN algorithm: a version that assigned concepts based on statistical frequency and a weighted version that took into account the order of the retrieved neighbors. Both models used the CNN image encoders to improve their retrieval capabilities. Regarding the Caption Prediction task, we fine-tuned the InstructBLIP model to generate initial captions and then enhanced it by employing rephrasing techniques with further pre-trained models. We also used synthesizing techniques that incorporated information from similar neighboring images in the training set to refine these captions. Additionally, we employed "Distance from Median Maximum Concept Similarity" (DMMCS), a novel guided-decoding approach that drives the model's behaviour throughout the decoding process, aiming to integrate information from the predicted concepts of Concept Detection. We explored the application of DMMCS to all of our developed systems. Our group ranked 2nd in Concept Detection and 4th in Caption Prediction.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>ImageCLEF <ref type="bibr" target="#b0">[1]</ref> is an ongoing evaluation initiative, first run in 2003 as part of the Cross Language Evaluation Forum (CLEF) 1 , that promotes the evaluation of technologies for annotation, indexing, classification, and retrieval of multi-modal data. ImageCLEFmedical is one of the four main tasks in this year's ImageCLEF campaign. We participated in the ImageCLEFmedical Caption task, which was organized for the eighth time <ref type="bibr" target="#b1">[2]</ref>. As in previous years, the task comprised two sub-tasks: Concept Detection and Caption Prediction.</p><p>The objective of Concept Detection is to accurately associate a biomedical image with one or more relevant medical concepts (tags), while in Caption Prediction, the goal is to automatically generate a preliminary diagnostic report that accurately describes the medical findings, as well as the anatomy of the body structures and organs shown in the image. Diagnostic Captioning remains a challenging research problem aimed at assisting the diagnostic process for patients by providing a preliminary report, rather than replacing medical professionals involved in the procedure <ref type="bibr" target="#b2">[3]</ref>. It can thus be seen as an assistive tool, capable of producing an initial draft diagnosis regarding the patient's condition. Such a document would ideally allow doctors to focus on critical areas of the image <ref type="bibr" target="#b3">[4]</ref> and help them produce more precise medical diagnoses at an increased speed <ref type="bibr" target="#b4">[5]</ref>. Experienced clinicians could enhance their throughput by analyzing the large volume of daily medical examinations more quickly and efficiently. Less experienced clinicians could consider the automatically generated captions to reduce the likelihood of clinical errors <ref type="bibr" target="#b5">[6]</ref>. 
Concept Detection can further improve Diagnostic Captioning by identifying key concepts that should be included in the draft report. We demonstrate the connection between the two sub-tasks by using "Distance from Median Maximum Concept Similarity" (DMMCS)<ref type="foot" target="#foot_0">2</ref>  <ref type="bibr" target="#b6">[7]</ref>, which employs information derived from our Concept Detection systems in order to improve the performance of our Caption Prediction systems.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.1.">AUEB NLP Group contributions</head><p>In this work, we present the experiments conducted and the systems submitted as part of the AUEB NLP Group's participation in this year's Concept Detection and Caption Prediction tasks. We used a number of new approaches influenced by the remarkable progress in the field of NLP and based on instruction-tuned Large Language Models (LLMs) <ref type="bibr" target="#b7">[8]</ref>.</p><p>Our submissions to the Concept Detection sub-task are based on two distinct approaches. We used a Convolutional Neural Network (CNN) encoder to extract visual features from the medical images. In the first approach, these features were fed into a Feed-Forward Neural Network (FFNN) to classify the images into various medical concepts. In the second approach, we implemented a separate method using a 𝑘-nearest neighbors (𝑘-NN) algorithm. In this approach, 𝑘 neighbors are first retrieved, and the most frequently occurring concepts among these neighbors are selected.</p><p>Regarding the Caption Prediction sub-task, we tried five main approaches. First, we employed an InstructBLIP model <ref type="bibr" target="#b8">[9]</ref> that was fine-tuned on the specified dataset <ref type="bibr" target="#b9">[10]</ref> to generate an initial set of captions, which were then also used in the other four approaches. In the second approach, we enhanced the initial captions by drawing insights from captions of similar images and training a FLAN-T5 model <ref type="bibr" target="#b10">[11]</ref> to refine them <ref type="bibr" target="#b11">[12,</ref><ref type="bibr" target="#b12">13]</ref>. The third approach was similar, but instead of FLAN-T5, we employed ClinicalT5 <ref type="bibr" target="#b13">[14]</ref>, which is pre-trained on numerous medical datasets, in order to rephrase and correct the initial captions produced by InstructBLIP. 
The fourth approach involved integrating the DMMCS algorithm <ref type="bibr" target="#b6">[7]</ref> in the language model's decoding process in order to promote the inclusion of a given set of keywords, which in this case were predicted by one of our Concept Detection systems. Lastly, we also applied DMMCS decoding to ClinicalT5 in order to maximize their efficacy and improve the overall caption quality. In all our models we used CNN encoders, since there are signs that vision transformers <ref type="bibr" target="#b14">[15]</ref> still have inferior performance in visual tasks, such as classification and semantic segmentation <ref type="bibr" target="#b15">[16]</ref>, especially in medical image tagging <ref type="bibr" target="#b4">[5,</ref><ref type="bibr" target="#b16">17]</ref>.</p><p>Extending our history of successful entries <ref type="bibr" target="#b17">[18,</ref><ref type="bibr" target="#b18">19,</ref><ref type="bibr" target="#b19">20,</ref><ref type="bibr" target="#b20">21,</ref><ref type="bibr" target="#b21">22]</ref> in the ImageCLEFmedical campaign, our submissions ranked 2nd among 9 participating groups in the Concept Detection sub-task and 4th among 11 participating groups in the Caption Prediction sub-task. In Section 2, we provide insight into this year's dataset, followed by a discussion of our approaches in Section 3. In Section 4, we present our experimental results for each sub-task. Finally, in Section 5, we summarize our findings and suggest directions for future research.</p><p>All code used for our experiments is available on GitHub.<ref type="foot" target="#foot_1">3</ref> </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Data</head><p>In this year's edition of the ImageCLEFmedical Caption task, the dataset is an updated and extended version of the Radiology Objects in Context (ROCO) dataset <ref type="bibr" target="#b9">[10]</ref>, which originates from biomedical articles of the PubMed Open Access (PMC OA) subset.<ref type="foot" target="#foot_2">4</ref> This dataset, which is common for both sub-tasks, consists of 80,080 biomedical images along with their respective medical concepts, in the form of UMLS <ref type="bibr" target="#b22">[23]</ref> terms<ref type="foot" target="#foot_3">5</ref>, and diagnostic captions. The dataset was originally split by the organizers into training and validation subsets, with 70,108 radiology images in the first set and 9,972 in the latter. After merging the provided data, we split them again, this time into three subsets, in order to also obtain a development (private test) subset for evaluation purposes. We used a 75%-10%-15% training-validation-development split, keeping relatively equal concept distributions in all three subsets. Consequently, we obtained 64,928 images as our training data, 7,179 images as our validation set, while the remaining 7,973 images constituted our held-out development set. All of our submissions were also evaluated on the hidden official test set (ROCOv2) <ref type="bibr" target="#b23">[24]</ref>. The test dataset utilizes Radiology Objects in COntext Version 2 (ROCOv2) <ref type="bibr" target="#b23">[24]</ref>, an updated and extended version of the ROCO dataset <ref type="bibr" target="#b9">[10]</ref>. This set includes 17,237 previously unseen images.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Concept Detection</head><p>Concept Detection is a multi-label classification problem covering a broad range of 1,945 distinct biomedical concepts, originating from the Unified Medical Language System (UMLS) <ref type="bibr" target="#b22">[23]</ref>. In this sub-task, the goal is to identify (assign) the distinct medical concepts (tags) depicted in each image (e.g., particular medical conditions). Among the available concepts (tag set), four are specific imaging modalities: X-Ray Computed Tomography, Ultrasonography, Magnetic Resonance Imaging (MRI), PET/CT scans. All concepts are represented by Concept Unique Identifiers (CUIs) following the UMLS standard. Some examples of images and their ground truth concepts can be found in Figure <ref type="figure" target="#fig_1">1</ref>.  The distribution of concepts is highly skewed. Some concepts are present in more than 25,000 images, whereas others are associated with only 1 image. Figure <ref type="figure" target="#fig_2">2</ref>(a) depicts the long-tail distribution of the entire (development + validation + train) dataset, as shown in the left plot, where the frequencies of the concepts (number of images each concept is associated with) are plotted in descending order against their respective class indices. After conducting a comprehensive exploratory analysis of this year's dataset, we found that certain concepts were more prevalent (Table <ref type="table">1</ref>); these mostly correspond to kinds of medical examinations, such as X-Ray Computed Tomography or Plain x-ray. Most images are associated (in the ground truth) with at least one of these overarching concepts, alongside more specialized ones. The maximum and minimum number of concepts assigned to a single image are 27 and 1, occurring in 1 and 8,567 images respectively. The average number of assigned concepts per image is 3.1583. 
The aforementioned observations are outlined in the histogram in Figure <ref type="figure" target="#fig_2">2(b)</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 1</head><p>The ten most frequent concepts (CUIs) of the ImageCLEFmedical2024 dataset, along with their corresponding UMLS terms, and the number of images they are associated with.  </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Most Common Concepts</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Caption Prediction</head><p>In the Caption Prediction data, each image is accompanied by a gold diagnostic caption that describes the medical conditions present in the image. There are 80,080 gold captions across the whole dataset, one for each provided image. Similar to last year's campaign, the vast majority of the captions, specifically 99.47% (79,658 out of 80,080 captions), are unique. The maximum number of words in a single caption is 848 (occurred once), while the minimum is 1 (encountered 73 times). The average caption length is 21.01 words. These statistics apply to the dataset as a whole, but we have carefully checked that they remain consistent in all three subsets (training, validation, development) we formed. The five most common captions, as well as the ten most popular words, excluding the stopwords, can be found in Tables <ref type="table">2 and 3</ref>, respectively. In Figure <ref type="figure" target="#fig_3">3</ref>, we provide a histogram alongside a box plot, utilizing a logarithmic scale in our visualizations. This helps make smaller counts more visible and reduces the dominance of larger values, giving a more balanced view of how the data is distributed. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 2</head><p>The five most common gold captions found in the ImageCLEFmedical2024 dataset <ref type="bibr" target="#b9">[10]</ref> alongside the number of images they are associated with.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Most common captions</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Rank Caption</head><p>Occurrences</p><formula xml:id="formula_0">1 Initial panoramic radiograph. 40 2</formula><p>Final panoramic radiograph. 37 3</p><p>Chest X-ray. 20 4</p><p>Chest radiograph. 17 5</p><p>Preoperative CT scan. 9</p><p>According to the organizers, each caption is pre-processed before evaluated in the following manner:</p><p>• The caption is converted to lower-case.</p><p>• Numbers are replaced by words, e.g., number 10 becomes "ten".</p><p>• Punctuation is removed.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Methods</head><p>In this section, we present the methods we used in our submissions for both the Concept Detection and the Caption Prediction sub-tasks.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Concept Detection</head><p>Our submissions for this year's Concept Detection sub-task are built upon two frameworks. Initially, we extensively explored a CNN+FFNN framework, building upon our prior research <ref type="bibr" target="#b17">[18,</ref><ref type="bibr" target="#b18">19,</ref><ref type="bibr" target="#b19">20,</ref><ref type="bibr" target="#b20">21]</ref>, experimenting with various image encoders. Additionally, we used a neural image retrieval approach by integrating a 𝑘-nearest neighbors (𝑘-NN) algorithm, which selects 𝑘 neighbors and aggregates tags based on their frequency among the neighbors. Furthermore, we submitted several ensembles of the aforementioned systems. The ensembles employed strategies such as union-based and intersectionbased aggregation.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 3</head><p>The ten most common words (of gold captions) and their frequencies in the ImageCLEFmedical2024 dataset <ref type="bibr" target="#b9">[10]</ref>, after removing stop-words. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Most common words (excluding stop-words)</head><note type="other">Word</note></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.1.">CNN + FFNN</head><p>This system employs a CNN encoder as its backbone, followed by an FFNN classification head. We extract image features from the last convolutional layer of the image encoder and we condense these feature maps into a feature vector (an image embedding) using global pooling. More specifically, we used the Generalized-Mean (GeM) pooling <ref type="bibr" target="#b24">[25]</ref> mechanism.</p><p>The FFNN component classifies the image into one or more concepts. Its output layer has |𝐶| neurons, where 𝐶 represents the set of unique concepts in the dataset. Each neuron uses a sigmoid activation function to transform its value into a probability value in [0, 1]. This results in one probability per label, and if this probability exceeds a specific threshold value 𝑡, the corresponding concept is assigned to the image. The threshold, which is the same for all concepts, was chosen through a grid search procedure that optimized the primary metric of the competition, on our validation set. The model was trained by minimizing binary cross-entropy, treating each concept as a separate binary target and summing up the individual losses. We used the Adam optimizer <ref type="bibr" target="#b25">[26]</ref>, along with a decreasing learning rate strategy and early stopping based on the validation set loss with a patience value of 3 epochs. We used an initial learning rate of 𝜂 = 10 −3 and decreasing factor of 10.</p><p>In order to form the ensembles, we trained several instances of the model, using different random initializations, and combined them using the union and the intersection of their predicted concept sets. More details about our submitted ensemble systems can be found in Section 4.1.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.2.">CNN + 𝑘-NN</head><p>For our 𝑘-nearest neighbors (𝑘-NN) approach, we leveraged the image embeddings obtained from the encoder of the trained CNN+FFNN system (Section. 3.1.1). We discarded the dense classification head and used the last GeM pooling layer to extract embeddings (feature vectors) for all the training images. These embeddings served as the basis for the retrieval process in the 𝑘-NN algorithm. Given a test image, the goal of the system is to retrieve similar images from the training set and select concepts from the retrieved neighbors. For each test image, we used the same encoder to obtain its embedding and we retrieved the 𝑘 closest neighbors from the training set, based on cosine similarity computed on the image embeddings. We tuned the value of 𝑘 in the range from 1 to 100 using our validation set, which led to 𝑘 = 33.</p><p>For each test image, having obtained its 𝑘 neighbors from the training set, we formed the set of concepts associated with the neighbors. We then ranked the concepts of the set based on the number of retrieved neighbors associated with each concept, ordering them from highest to lowest frequency. The concept with the highest frequency was always included in the predictions of the 𝑘-NN method for the test image. We then used two thresholds, 𝑡 1 and 𝑡 2 , which we tuned using grid search on our validation set, to select which other concepts of the neighborhood to include in the predictions of 𝑘-NN. 
We calculated the difference in frequency (Fr) between the first and second most frequent concepts, divided by the frequency of the first concept, and if the result exceeded 𝑡 1 , we included the second concept in the prediction:</p><formula xml:id="formula_1">Fr(concept 1 ) − Fr(concept 2 ) Fr(concept 1 ) ≥ 𝑡 1 .<label>(1)</label></formula><p>Similarly, we determined whether to include in the prediction the third most frequent concept or not, based on a comparison involving the first and third most frequent concepts. We calculated the difference between the frequencies of the first and third concepts, dividing it by the frequency of the first concept, and if this ratio exceeded 𝑡 2 , we included the third concept:</p><formula xml:id="formula_2">Fr(concept 1 ) − Fr(concept 3 ) Fr(concept 1 ) ≥ 𝑡 2 .<label>(2)</label></formula><p>The same approach was applied to the difference between the first and fourth most frequent concepts, checking again against 𝑡 2 , to decide if the fourth most frequent concept should be predicted:</p><formula xml:id="formula_3">Fr(concept 1 ) − Fr(concept 4 ) Fr(concept 1 ) ≥ 𝑡 2 .<label>(3)</label></formula><p>We opted to predict at most four concepts due to the fact that the average number of concepts in the training split was 3.08. The rationale was to select concepts that have frequencies close to that of the highest frequency concept, while excluding concepts that show a significant drop in frequency compared to the preceding ones. We experimented with 𝑡 1 , 𝑡 2 values ranging from 0.3 to 0.9. Validation results indicated that the best parameters were 𝑡 1 = 0.58 and 𝑡 2 = 0.65.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.3.">CNN + weighted 𝑘-NN</head><p>We also developed a weighted version of the 𝑘-NN algorithm, using the voting scheme that was described in <ref type="bibr" target="#b26">[27]</ref> </p><formula xml:id="formula_4">s.t. 1 ≥ 𝑤 1 ≥ . . . ≥ 𝑤 𝑘 ≥ 0 .<label>(5)</label></formula><p>In detail, we created a population of 500 randomly initialized weight vectors, initial chromosomes in GA terminology. Each chromosome had the form ⟨𝑤 1 , . . . , 𝑤 𝑘 ⟩, with all weights 𝑤 𝑖 ∈ [0, 1]; we ensured that the monotonicity constraint 1 ≥ 𝑤 1 ≥ . . . ≥ 𝑤 𝑘 ≥ 0 was satisfied by all chromosomes. We then used a crossover mechanism where two chromosomes were combined to form two new ones. At each application of the crossover mechanism, we selected pairs of chromosomes (parents) out of the population and combined their values to form two new ones from each pair of parents. The crossover operator splits the two parent chromosomes at a random point and creates two children chromosomes by combining the values before the crossover point (or after) for one parent, and after (or before) the crossover point for the other parent. Furthermore, we used a mutation mechanism that perturbed the values of the resulting children chromosomes by adding a random value in [−0.1, 0.1] to every gene, with a 0.1 mutation probability per gene (𝑤 𝑖 ). Both the crossover and the mutation operators paid respect to the range and monotonicity constraints; we added a clipping and a sorting operation that were applied if any of the constraints were violated in the resulting chromosomes. We used 𝐹 1 (𝑌 (𝑥), 𝐻(𝑥)) as the fitness function. The fitness function is used to select the chromosomes to be used as parents in the crossover mechanism at each iteration of the algorithm (fitter chromosomes are selected with higher probability as parents). 
At each generation (new population), we performed the crossover mechanism as many times as necessary to have a new generation with as many members as the previous one (and as many as the initial population, i.e., 500 chromosomes). We ran the optimization process for 30 iterations (generations).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Caption Prediction</head><p>Our submissions for the Caption Prediction sub-task focused on four primary systems. The first system employs an InstructBLIP model <ref type="bibr" target="#b8">[9]</ref> (Section 3.2.1), while the remaining submissions build on this model using techniques such as rephrasing <ref type="bibr" target="#b11">[12,</ref><ref type="bibr" target="#b12">13]</ref> (Section 3.2.3) and synthesizing <ref type="bibr" target="#b11">[12]</ref> (Section 3.2.2). Finally, we implemented an innovative guided-decoding mechanism, DMMCS <ref type="bibr" target="#b6">[7]</ref> (Section 3.2.4), which leverages information from the tags predicted by our CNN+𝑘-NN classifier (Section 3.1.2) in the Concept Detection task to improve the generated caption.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.1.">InstructBLIP</head><p>The InstructBLIP model <ref type="bibr" target="#b8">[9]</ref> is a sophisticated neural network designed to generate descriptive text for scientific images. It employs a technique known as instruction-tuning <ref type="bibr" target="#b28">[29]</ref>, which refines its behavior and responses based on user-provided instructions. This approach aims to enhance the model's controllability and its adaptability across different domains. The InstructBLIP model comprises three key components: an image encoder, a Q-Former <ref type="bibr" target="#b29">[30]</ref>, and an LLM. The frozen image encoder converts the image into a low-dimensional vector and generates image embeddings. The Q-Former then extracts instruction-aware visual features from these embeddings and can process the text prompt (instruction) to enhance this extraction. Through extensive training, the LLM learns to correlate textual prompts with relevant image features, thereby generating coherent and contextually appropriate descriptions. The InstructBLIP model played a crucial role in creating the initial captions, which were subsequently utilized in our other caption prediction methods.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.2.">Synthesizer</head><p>Our goal was to improve the captions obtained from the InstructBLIP model (Section 3.2.1) by leveraging information from similar training images, based on the intuition that similar images may have similar captions <ref type="bibr" target="#b30">[31,</ref><ref type="bibr" target="#b31">32]</ref>. To achieve this, we computed embeddings for all images in the dataset using the CNN + FFNN model, which was developed for Concept Detection (Section 3.1.1). A cosine similarity threshold was then applied to decide if an image qualified as a neighbor of the test image. Images exceeding this threshold were considered neighbors <ref type="bibr" target="#b32">[33]</ref>. For each image in the test set <ref type="bibr" target="#b23">[24]</ref>, we identified the 𝑘 most similar images from the entire dataset <ref type="bibr" target="#b9">[10]</ref>, which includes training, validation, and development images, to retrieve their corresponding captions. We experimented with 𝑘 ∈ {1, 3, 5}; the best results in our validation set were obtained for 𝑘 = 5, so we used that value. The Synthesizer, a FLAN-T5 model <ref type="bibr" target="#b10">[11]</ref>, was trained to refine the captions generated by InstructBLIP by considering also the captions of the neighbors, which are concatenated to the caption of InstructBLIP, similarly in spirit to <ref type="bibr" target="#b12">[13]</ref>. We also experimented with different beam sizes 𝑚, for the beam search decoding of the Synthesizer during inference; setting 𝑚 = 5 yielded the best validation scores, so we used that value. Figure <ref type="figure" target="#fig_4">4</ref> illustrates the process (for 𝑚 = 3), starting with the caption generated by InstructBLIP, merging it with the captions of the neighbors, and using FLAN-T5 to obtain a refined caption. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.3.">Rephraser</head><p>Furthermore, we experimented with a domain-specific variation of T5, namely ClinicalT5. This is an encoder-decoder transformer, which is pre-trained in a series of both supervised and unsupervised tasks <ref type="bibr" target="#b33">[34]</ref>, including denoising tasks, and then further pre-trained on the union of MIMIC-III and IV clinical notes, to which we were granted access through PhysioNet<ref type="foot" target="#foot_4">6</ref> . Following our previous work <ref type="bibr" target="#b34">[35]</ref>, we created a corrective text-to-text training set, consisting of noisy and ground truth caption pairs, with the former having been generated by our captioning systems. Therefore, we treated our original system as a noise-insertion function, then we further fine-tuned ClinicalT5, in order to rephrase the noisy captions to approximate the gold ones, hoping it would acquire knowledge of the medical domain, use medical terms more accurately and therefore generate more medically fluent text captions. Specifically, we fine-tuned ClinicalT5 to rephrase the captions of InstructBLIP (Section 3.2.1), InstructBLIP with FLAN-T5 Synthesizer (Section 3.2.2) on top and InstructBLIP with DMMCS (Section 3.2.4) using 𝛼 = 0.10. Performance in terms of the primary metric in our development set improved, but test-time performance (in the official evaluation) deteriorated.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.4.">DMMCS</head><p>In this section, we present "Distance from Median Maximum Concept Similarity" (DMMCS) <ref type="bibr" target="#b6">[7]</ref>, a novel data-driven guided decoding mechanism designed to incorporate domain-specific information (in the form of keywords) into the text generation process. The intuition behind this guided decoding algorithm lies in the observation that an accurate diagnostic caption should mention the key medical conditions depicted in the given image. For example, if a radiology image is assigned the tag "Pneumonia", but the generated caption does not refer to this medical condition either explicitly or implicitly, then the caption is potentially inaccurate. Such conditions are typically represented by the medical tags provided in the ImageCLEF2024 dataset, which the Concept Detection task is also trying to predict. Therefore we use tags predicted by one of our Concept Detection systems (Section 3.1), in order to guide our Caption Prediction models towards captions that express the tags appropriately. We achieve this by imposing a new penalty at each decoding step, aiming to prioritize the generation of words semantically similar to the (predicted) medical tags. This penalty also considers the frequency with which each tag is explicitly or implicitly expressed in the dataset's gold captions.</p><p>In more detail, recent work examining DC datasets <ref type="bibr" target="#b21">[22,</ref><ref type="bibr" target="#b6">7]</ref> has shown that some tags are more prominently expressed than others in the corresponding diagnostic captions. More specifically, Kaliosis et al. <ref type="bibr" target="#b6">[7]</ref> performed an exploratory analysis on the ImageCLEF2023 and MIMIC-CXR datasets, where they investigated the relationship between each tag and the gold captions of the images that are associated with the tag in the ground truth. 
This was achieved by calculating the cosine similarity between the word embeddings of each caption's tokens and each tag. The results showed that some tags are always explicitly expressed in the gold captions of the images the tags are associated with, while other tags are mentioned more implicitly or even not at all. More concretely, the similarity between a tag 𝑡 and a caption 𝑐 is defined as the maximum cosine similarity (MCS) between the centroid ℎ(𝑡) of the word embeddings of 𝑡 and the embedding ℎ(𝑐 𝑖 ) of each token in 𝑐, i.e.,</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>MCS(𝑡</head><formula xml:id="formula_5">, 𝑐) = max 1≤𝑖≤|𝑐| sim(ℎ(𝑡), ℎ(𝑐 𝑖 )).<label>(7)</label></formula><p>A high MCS score between a tag 𝑡 and a caption 𝑐 implies that 𝑡 is strongly expressed in the caption, while a low MCS score indicates that it was rather implicitly (or not at all) mentioned. The MCS similarity is also calculated for all the gold captions of the images a tag 𝑡 is associated with in the training data. Specifically, for each tag 𝑡 and the set 𝐶 containing its associated captions, the distribution 𝑅(𝑡, 𝐶) is calculated as:</p><formula xml:id="formula_6">𝑅(𝑡, 𝐶) = {MCS(𝑡, 𝑐)|𝑐 ∈ 𝐶}.<label>(8)</label></formula><p>The median value of the distribution 𝑅(𝑡, 𝐶), hereafter called Median Maximum Cosine Similarity (MMCS), indicates how strongly 𝑡 is expressed on average in the training captions it is associated with.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>MMCS(𝑡</head><formula xml:id="formula_7">, 𝐶) = median(𝑅(𝑡, 𝐶)).<label>(9)</label></formula><p>During inference, when generating the caption for an image with a single tag 𝑡, the MCS(𝑡, 𝑐) of the tag 𝑡 and each candidate (possibly still incomplete) caption 𝑐 of the beam search is calculated (Eq. 7). The penalty, imposed at each decoding step, is then defined as the squared difference between MCS(𝑡, 𝑐) and MMCS(𝑡, 𝐶). The former shows how strongly the tag is mentioned in the candidate caption, while the latter indicates how strongly the tag is expressed on average in the gold training captions associated with the tag. When more than one tags are assigned to an image, a distinct penalty is calculated for each tag, and the overall penalty is the average of the individual penalties. Thus, given a candidate caption 𝑐, the set of its associated training captions 𝐶, and a set of tags 𝑇 , the penalty is calculated as:</p><formula xml:id="formula_8">DMMCS pen (𝑇, 𝐶, 𝑐) = 1 |𝑇 | ∑︁ 𝑡∈𝑇 (MCS(𝑡, 𝑐) − MMCS(𝑡, 𝐶)) 2 .<label>(10)</label></formula><p>Intuitively, the objective of the DMMCS algorithm is to guide the model to generate captions that express each associated tag as explicitly (or implicitly) as it is expressed in the training corpus. Overall, at each decoding step, each candidate caption 𝑐 generated through the beam search process is scored by the following formula:</p><formula xml:id="formula_9">DMMCS(𝑐) = 𝛼 • DMMCS pen (𝑇, 𝐶, 𝑐) + (1 − 𝛼) • (1 − D score ),<label>(11)</label></formula><p>where 𝑇 is a given set of predicted tags, 𝛼 is a tunable weighting factor, while D score is the score that the decoder assigns to the candidate caption 𝑐.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Experiments, Submissions and Results</head><p>In this section, we provide details about our experiments regarding this year's evaluation campaign <ref type="bibr" target="#b0">[1]</ref>. Moreover, we share details about our submissions and the scores achieved in our held-out development set, as well as the official test set of the competition <ref type="bibr" target="#b23">[24]</ref> for both sub-tasks.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.">Concept Detection</head><p>In the Concept Detection sub-task we submitted our ten best performing models, after evaluating them on our held-out development set. We submitted two instances with different image encoders of our CNN + FFNN model (Section 3.1.1), one instance of our CNN + 𝑘-NN model (Section 3.1.2), and a single instance of our CNN + weighted 𝑘-NN model (Section 3.1.3). In our subsequent submissions, we employed ensemble systems. These involved exploring the integration of predictions from multiple instances by computing either the union or the intersection of their predicted concept sets. Our submitted ensemble systems consisted of various combinations of CNN-based architectures paired with different classifiers, specifically CNN + FFNN, CNN + 𝑘-NN (KNN), and CNN + weighted 𝑘-NN (wKNN). To enhance the diversity and robustness of our ensembles, we incorporated different architectures for the CNN component.</p><p>The primary evaluation metric for this year's Concept Detection sub-task was the 𝐹 1 -score, calculated between the predicted and ground truth captions. It is calculated as the sum of the 𝐹 1 -scores for each test image, divided by the total number of test images. Each partial score is derived from the binary multi-hot candidate vector compared to the corresponding ground truth vector. Specifically, let 𝐹 1 represent the overall 𝐹 1 -score, and 𝑓 1 ^denote the individual 𝐹 1 -score for each test image. Additionally, let 𝑝 𝑡 and 𝑔 𝑡 be the predicted and ground truth concepts for an image 𝑡, respectively. 
Finally, let 𝑇 be the test set <ref type="bibr" target="#b23">[24]</ref>.</p><formula xml:id="formula_10">𝐹 1 = 1 |𝑇 | ∑︁ 𝑡∈𝑇 𝑓 1 ^(𝑝 𝑡 , 𝑔 𝑡 )<label>(6)</label></formula><p>Moreover, a secondary evaluation metric (again an 𝐹 1 score) was calculated, which only considered manually selected concepts, such as anatomy, topography, and modality.</p><p>For our first system (CNN+FFNN), we experimented with a variety of CNN encoders as their backbone components. Specifically, we trained the networks using state-of-the-art CNN architectures, including EfficientNet and DenseNet. Furthermore, we extended our experiments by incorporating these CNN encoders into our 𝑘-NN models.</p><p>During testing on our held-out development set, we observed a slightly higher F1 score in models utilizing the EfficientNet image encoder.</p><p>Our ensembling approaches did not show significant improvement over our individual models, with minimal differences observed in both the development and test set <ref type="bibr" target="#b23">[24]</ref>. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.">Caption Prediction</head><p>For the Caption Prediction sub-task, we submitted nine systems based on their performance on our development set. Our submissions included InstructBLIP (Section 3.2.1), a synthesizer variant combining InstructBLIP with FLAN-T5 (Section 3.2.2), and a rephrasing variant that employs ClinicalT5 (Section 3.2.3). Additionally, we explored combinations of all three approaches, aiming to refine the captions generated by InstructBLIP and FLAN-T5 (Section 3.2.2) using our ClinicalT5 rephraser on top. Furthermore, we submitted three variations of InstructBLIP and DMMCS, each with a different 𝛼 value (Section 3.2.4). Finally, we provided two instances where we employed ClinicalT5 to rephrase the results generated by the combination of InstructBLIP and DMMCS, in this case using 𝛼 = 0.10.</p><p>In this year's campaign, BERTScore <ref type="bibr" target="#b35">[36]</ref> was the primary evaluation metric in the Caption Prediction task, while ROUGE <ref type="bibr" target="#b36">[37]</ref> was the secondary metric. Other metrics utilized include, for example, BLEU-1 <ref type="bibr" target="#b37">[38]</ref>, BLEURT <ref type="bibr" target="#b38">[39]</ref>, and METEOR <ref type="bibr" target="#b39">[40]</ref>. Table <ref type="table" target="#tab_5">6</ref> shows captions produced by each of our submissions for the test image CC BY <ref type="bibr">[Muacevic et al. (2024)</ref>], extracted from the test dataset <ref type="bibr" target="#b23">[24]</ref>.</p><p>Finally, Table <ref type="table">7</ref> provides an overview of our models, detailing their performance across fundamental campaign metrics in both our development set and the provided test set <ref type="bibr" target="#b23">[24]</ref>, along with our attained rankings.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 7</head><p>Summary of the scores of our submissions to the ImageCLEFmedical2024 Caption Prediction sub-task. rankings. Additionally, Table <ref type="table" target="#tab_7">8</ref> presents a summary of all the metrics utilized in this year's campaign, offering a comprehensive view of the experiments.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>AUEB NLP Group -Submission</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Conclusion</head><p>Our participation in the ImageCLEFmedical Caption task provided an opportunity to explore innovative NLP approaches for medical image captioning. Utilizing state-of-the-art models, we demonstrated competitive performance in both the Concept Detection and Caption Prediction sub-tasks.</p><p>In the Concept Detection sub-task, we achieved a 2 nd place ranking among the participating groups. Our top-performing system was a CNN+FFNN pipeline (Section 3.1.1), while our remaining submissions included a CNN+KNN (Section 3.1.2) and a CNN+wKNN (Section 3.1.3), which also produced competitive results. We also employed ensembles that combined these approaches using union and intersection (of predicted tags) approaches.</p><p>In the Caption Prediction sub-task, we were ranked 4 th among all participating groups, by both extending our previous work <ref type="bibr" target="#b21">[22,</ref><ref type="bibr" target="#b20">21,</ref><ref type="bibr" target="#b16">17]</ref> and exploiting the state-of-the-art in NLP, such as instruction-tuned Large Language Models. Our approach involved the initial generation of captions using the InstructBLIP model <ref type="bibr" target="#b8">[9]</ref>, followed by their enrichment through the synthesis of information from the captions of similar images <ref type="bibr" target="#b11">[12,</ref><ref type="bibr" target="#b12">13]</ref> and the utilization of a model further pre-trained in the medical domain <ref type="bibr" target="#b13">[14]</ref> to improve the originally generated captions.</p><p>In future work, we plan to further investigate and improve biomedical LLMs and further explore their reasoning capabilities through instruction tuning and, more generally, alignment with medical professionals' needs <ref type="bibr" target="#b40">[41]</ref>. 
We also plan to utilize a model capable of processing both image and text inputs in our Synthesizer approach (Section 3.2.2) to combine information not only from the captions of the neighbors, but also from the images themselves. Furthermore, we plan to exploit Retrieval-Augmented Generation <ref type="bibr" target="#b41">[42]</ref> algorithms to combine prior knowledge with new medical cases. Finally, the generated captions need to be evaluated in collaboration with medical experts, to assess their medical accuracy and usefulness.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head></head><label></label><figDesc>Magdás et al. (2021)]</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: CC BY [Magdás et al. (2021)] from the ImageCLEFmedical2024 dataset, along with the corresponding CUIs and UMLS terms.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Figure 2 :</head><label>2</label><figDesc>Figure 2: (a) Visualization of the dataset's long-tail distribution. The y-axis shows the number of occurrences of each concept, and the x-axis the concept's class index. (b) Histogram with 25 fixed-size bins (horizontal axis) depicting the number of gold concepts per image. Note that 13 concepts do not have corresponding UMLS terms.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 3 :</head><label>3</label><figDesc>Figure 3: (a) Histogram visualizing the distribution of caption lengths. The 𝑦-axis, displayed on a logarithmic scale, represents the number of images falling into each bin, while the 𝑥-axis shows the number of words in the captions. (b) Box-plot illustrating the same distribution, with the 𝑦-axis displayed on a logarithmic scale, highlighting outliers in the range of 100 to 200 words.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_4"><head>Figure 4 :</head><label>4</label><figDesc>Figure 4: Illustration of a radiology image (CC BY [Muacevic et al., 2024]), accompanied by similar neighbor images (CC BY-NC [Popa et al., 2014], CC BY-NC [Popa et al., 2014], CC BY-NC [Bang et al., 2015]) and their corresponding captions from the 2024 ImageCLEFmedical caption task<ref type="bibr" target="#b9">[10,</ref><ref type="bibr" target="#b23">24]</ref>. The initial caption, generated by InstructBLIP, is concatenated with the captions of the neighbors and is then fed to a FLAN-T5 Synthesizer, which generates a refined caption.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head></head><label></label><figDesc>. More specifically, given a test image 𝑥, we calculate for each concept 𝑐 𝑖 ∈ 𝐶 a score 𝑓 𝑖 (𝑥; 𝑤 1 , . . . , 𝑤 𝑘 ) from the 𝑘 neighbors retrieved for 𝑥: 𝑓 𝑖 (𝑥; 𝑤 1 , . . . , 𝑤 𝑘 ) = 𝑦 𝑖,𝑗,𝑥 = 1 if concept 𝑐 𝑖 is present in the ground truth of the 𝑗-th neighbor of 𝑥, otherwise 𝑦 𝑖,𝑗,𝑥 = 0, and 𝑤 𝑗 is the weight assigned to the 𝑗-th nearest neighbor position; we explain below how the weights 𝑤 𝑗 are learned. Concept 𝑐 𝑖 is predicted for the test image 𝑥 if and only if 𝑓 𝑖 (𝑥; 𝑤 1 , . . . , 𝑤 𝑘 ) ≥ 𝑡, yielding the predicted label set 𝐻(𝑥; 𝑤 1 , . . . , 𝑤 𝑘 ) = {𝑐 𝑖 |𝑓 𝑖 (𝑥; 𝑤 1 , . . . , 𝑤 𝑘 ) ≥ 𝑡}. The classification threshold 𝑡 ∈ [0, 1] and the number of neighbors 𝑘 ∈ [1, 100] were tuned on our validation set, resulting in 𝑡 = 0.35 and 𝑘 = 50. The weights 𝑤 1 , . . . , 𝑤 𝑘 are the same for all the concepts 𝑐 𝑖 and test images 𝑥. They are learned using a genetic algorithm (GA)<ref type="bibr" target="#b27">[28]</ref> by maximizing the following objective, where 𝑉 denotes the validation set, 𝑌 (𝑥) is the ground truth set of concepts of image 𝑥, and 𝐹 1 is the official evaluation measure of the Concept Detection task:</figDesc><table><row><cell></cell><cell>∑︀ 𝑘 𝑗=1 𝑤 𝑗 • 𝑦 𝑖,𝑗,𝑥 𝑗=1 𝑤 𝑗 ∑︀ 𝑘</cell><cell>(4)</cell></row><row><cell>𝑤 1 ,...,𝑤 𝑘 where max</cell><cell>∑︁</cell></row></table><note>𝑥∈𝑉𝐹 1 (𝑌 (𝑥), 𝐻(𝑥; 𝑤 1 , . . . , 𝑤 𝑘 ))</note></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_3"><head>Table 4 Summary of the scores of our individual experiments (ensembles included) in the Image- CLEFmedical2024 Concept Detection sub-task.</head><label>4</label><figDesc>This table presents the highest scores of our systems on our held-out development set for each method.</figDesc><table><row><cell></cell><cell cols="2">Individual Concept Detection Experiments</cell></row><row><cell>Run ID</cell><cell>Method</cell><cell>Development</cell></row><row><cell>619</cell><cell>CNN+FFNN (DenseNet)</cell><cell>0.6007</cell></row><row><cell>624</cell><cell>CNN+KNN</cell><cell>0.6007</cell></row><row><cell>640</cell><cell>INTERSECTION(UNION(3xCNN+FFNN),624)</cell><cell>0.6022</cell></row><row><cell>642</cell><cell>UNION(2xCNN+FFNN)</cell><cell>0.6047</cell></row><row><cell>644</cell><cell>CNN+FFNN (EfficientNet)</cell><cell>0.6042</cell></row><row><cell>648</cell><cell>UNION(644,624)</cell><cell>0.6045</cell></row><row><cell>651</cell><cell>CNN+wKNN</cell><cell>0.5961</cell></row><row><cell>654</cell><cell>UNION(651,644)</cell><cell>0.6008</cell></row><row><cell>655</cell><cell>UNION(651,624)</cell><cell>0.5970</cell></row><row><cell>656</cell><cell>UNION(651,619)</cell><cell>0.5981</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_4"><head>Table 5 Summary of our submissions to the ImageCLEFmedical2024 Concept Detection sub-task.</head><label>5</label><figDesc>The table presents the scores of our systems on both our held-out development set and the official test set<ref type="bibr" target="#b23">[24]</ref>. It also includes the rankings of these systems among all submissions from the 9 participating teams.</figDesc><table><row><cell></cell><cell cols="3">Individual Concept Detection Experiments</cell><cell></cell></row><row><cell>Run ID</cell><cell>Method</cell><cell cols="2">Primary F1</cell><cell cols="2">Secondary F1 Rank</cell></row><row><cell></cell><cell></cell><cell>Dev</cell><cell>Test</cell><cell></cell></row><row><cell>619</cell><cell>CNN+FFNN (DenseNet)</cell><cell cols="2">0.6007 0.6240</cell><cell>0.9339</cell><cell>12</cell></row><row><cell>624</cell><cell>CNN+KNN</cell><cell cols="2">0.6007 0.6274</cell><cell>0.9375</cell><cell>8</cell></row><row><cell>640</cell><cell>INTERSECTION(UNION(3xCNN+FFNN),624)</cell><cell cols="2">0.6022 0.6272</cell><cell>0.9415</cell><cell>10</cell></row><row><cell>642</cell><cell>UNION(2xCNN+FFNN)</cell><cell cols="2">0.6047 0.6304</cell><cell>0.9332</cell><cell>7</cell></row><row><cell>644</cell><cell>CNN+FFNN (EfficientNet)</cell><cell cols="2">0.6042 0.6319</cell><cell>0.9392</cell><cell>4</cell></row><row><cell>648</cell><cell>UNION(644,624)</cell><cell cols="2">0.6045 0.6308</cell><cell>0.9321</cell><cell>6</cell></row><row><cell>651</cell><cell>CNN+wKNN</cell><cell cols="2">0.5961 0.6135</cell><cell>0.9238</cell><cell>17</cell></row><row><cell>654</cell><cell>UNION(651,644)</cell><cell cols="2">0.6008 0.6207</cell><cell>0.9243</cell><cell>13</cell></row><row><cell>655</cell><cell>UNION(651,624)</cell><cell cols="2">0.5970 0.6155</cell><cell>0.9233</cell><cell>16</cell></row><row><cell>656</cell><cell>UNION(651,619)</cell><cell cols="2">0.5981 
0.6162</cell><cell>0.9217</cell><cell>15</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_5"><head>Table 6</head><label>6</label><figDesc>Captions generated by our submitted models for the test image<ref type="bibr" target="#b23">[24]</ref> CC BY[Muacevic et al. (2024)]    </figDesc><table><row><cell></cell><cell>Generated captions</cell></row><row><cell>InstructBLIP</cell><cell>Diffusion-weighted magnetic resonance imaging of the brain</cell></row><row><cell></cell><cell>showing a hyperintense lesion in the right temporal lobe.</cell></row><row><cell>InstructBLIP + Synthesizer</cell><cell>magnetic resonance imaging of the head and neck showing a</cell></row><row><cell></cell><cell>hyperintense lesion in the right internal carotid.</cell></row><row><cell>InstructBLIP + Rephraser</cell><cell>Axial computed tomography scan of the head showing a mass</cell></row><row><cell></cell><cell>in the left maxillary sinus (arrow).</cell></row><row><cell>InstructBLIP + Synthesizer +</cell><cell>Computed tomography scan of the head and neck showing a</cell></row><row><cell>Rephraser</cell><cell>mass in the right parotid gland.</cell></row><row><cell cols="2">InstructBLIP + DMMCS (alpha 0.1) Chest X-ray showing bilateral pulmonary edema.</cell></row><row><cell>InstructBLIP + DMMCS (alpha 0.1)</cell><cell>Computed tomography scan of the head and neck showing a</cell></row><row><cell>+ Rephraser</cell><cell>mass in the right parotid gland.</cell></row><row><cell>InstructBLIP + DMMCS (alpha 0.1)</cell><cell></cell></row><row><cell>+ Rephraser (random restart)</cell><cell></cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_6"><head>Table Run</head><label>Run</label><figDesc></figDesc><table><row><cell>ID</cell><cell>Approach</cell><cell cols="2">BERTScore</cell><cell cols="2">ROUGE-1</cell><cell>Rank</cell></row><row><cell></cell><cell></cell><cell>Dev</cell><cell>Test</cell><cell>Dev</cell><cell>Test</cell></row><row><cell>564</cell><cell>InstructBLIP</cell><cell cols="4">0.6164 0.6152 0.1931 0.2052</cell><cell>22</cell></row><row><cell>577</cell><cell>InstructBLIP + Rephraser</cell><cell cols="4">0.7651 0.6106 0.1840 0.1837</cell><cell>26</cell></row><row><cell>605</cell><cell>InstructBLIP + Synthesizer</cell><cell cols="4">0.6194 0.6113 0.1898 0.1889</cell><cell>24</cell></row><row><cell>630</cell><cell>InstructBLIP + DMMCS</cell><cell cols="4">0.6564 0.6211 0.2027 0.2048</cell><cell>10</cell></row><row><cell></cell><cell>(𝛼 = 0.1)</cell><cell></cell><cell></cell><cell></cell></row><row><cell>635</cell><cell>InstructBLIP + DMMCS</cell><cell cols="4">0.6534 0.6210 0.2025 0.2047</cell><cell>11</cell></row><row><cell></cell><cell>(𝛼 = 0.05)</cell><cell></cell><cell></cell><cell></cell></row><row><cell>639</cell><cell>InstructBLIP + Synthesizer +</cell><cell cols="4">0.7603 0.6111 0.1840 0.1827</cell><cell>25</cell></row><row><cell></cell><cell>Rephraser</cell><cell></cell><cell></cell><cell></cell></row><row><cell>647</cell><cell>InstructBLIP + DMMCS (𝛼 = 0.1)</cell><cell cols="4">0.7981 0.6209 0.1928 0.1807</cell><cell>13</cell></row><row><cell></cell><cell>+ ClinicalT5</cell><cell></cell><cell></cell><cell></cell></row><row><cell>650</cell><cell>InstructBLIP + DMMCS (𝛼 = 0.1)</cell><cell cols="4">0.8012 0.6159 0.1932 0.1936</cell><cell>20</cell></row><row><cell></cell><cell>+ ClinicalT5 (random restart)</cell><cell></cell><cell></cell><cell></cell></row><row><cell>646</cell><cell>InstructBLIP + DMMCS</cell><cell cols="4">0.6530 0.6209 0.2024 0.2044</cell><cell>12</cell></row><row><cell></cell><cell>(𝛼 = 
0.15)</cell><cell></cell><cell></cell><cell></cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_7"><head>Table 8 Summary of our submissions regarding the Caption Prediction sub-task.</head><label>8</label><figDesc>The table contains each system's performance on all officially reported measures.</figDesc><table /><note>AUEB NLP</note></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_8"><head>Group Submissions -Evaluation on All Metrics Run ID BERTScore ROUGE BLEU-1 BLEURT METEOR CIDEr CLIPscore RefCLIPscore ClinicalBLEURT MedBERTScore Rank</head><label></label><figDesc></figDesc><table><row><cell>630</cell><cell>0.6211</cell><cell>0.2049</cell><cell>0.1110</cell><cell>0.2899</cell><cell>0.0680</cell><cell>0.1769</cell><cell>0.8041</cell><cell>0.7987</cell><cell>0.4866</cell><cell>0.6261</cell><cell>10</cell></row><row><cell>635</cell><cell>0.6210</cell><cell>0.2047</cell><cell>0.1108</cell><cell>0.2895</cell><cell>0.0680</cell><cell>0.1762</cell><cell>0.8040</cell><cell>0.7986</cell><cell>0.4870</cell><cell>0.6260</cell><cell>11</cell></row><row><cell>646</cell><cell>0.6210</cell><cell>0.2044</cell><cell>0.1107</cell><cell>0.2900</cell><cell>0.0678</cell><cell>0.1758</cell><cell>0.8041</cell><cell>0.7988</cell><cell>0.4872</cell><cell>0.6261</cell><cell>12</cell></row><row><cell>647</cell><cell>0.6210</cell><cell>0.1807</cell><cell>0.0860</cell><cell>0.2846</cell><cell>0.0580</cell><cell>0.1459</cell><cell>0.7936</cell><cell>0.7912</cell><cell>0.5021</cell><cell>0.6291</cell><cell>13</cell></row><row><cell>650</cell><cell>0.6160</cell><cell>0.1936</cell><cell>0.1050</cell><cell>0.2859</cell><cell>0.0638</cell><cell>0.1597</cell><cell>0.7980</cell><cell>0.7948</cell><cell>0.4874</cell><cell>0.6212</cell><cell>20</cell></row><row><cell>564</cell><cell>0.6153</cell><cell>0.2052</cell><cell>0.1274</cell><cell>0.2920</cell><cell>0.0698</cell><cell>0.1728</cell><cell>0.8045</cell><cell>0.7968</cell><cell>0.4844</cell><cell>0.6197</cell><cell>22</cell></row><row><cell>605</cell><cell>0.6114</cell><cell>0.1889</cell><cell>0.1147</cell><cell>0.2796</cell><cell>0.0616</cell><cell>0.1305</cell><cell>0.8037</cell><cell>0.7962</cell><cell>0.4834</cell><cell>0.6174</cell><cell>24</cell></row><row><cell>639</cell><cell>0.6111</cell><cell>0.1827</cell><cell>0.0744</cell><cell>0.2717</cell><ce
ll>0.0515</cell><cell>0.1293</cell><cell>0.7858</cell><cell>0.7845</cell><cell>0.5212</cell><cell>0.6141</cell><cell>25</cell></row><row><cell>577</cell><cell>0.6107</cell><cell>0.1838</cell><cell>0.0751</cell><cell>0.2706</cell><cell>0.0513</cell><cell>0.1292</cell><cell>0.7832</cell><cell>0.7826</cell><cell>0.5158</cell><cell>0.6134</cell><cell>26</cell></row></table></figure>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="2" xml:id="foot_0">https://github.com/nlpaueb/dmmcs, Last accessed: 2024-06-20.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="3" xml:id="foot_1">https://github.com/nlpaueb/imageclef2024, Last accessed: 2024-06-20.</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="4" xml:id="foot_2">PMC Open Access: https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist/, Last accessed: 2024-06-20</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="5" xml:id="foot_3">UMLS: https://www.nlm.nih.gov/research/umls/index.html, Last accessed: 2024-06-20</note>
			<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="6" xml:id="foot_4">https://www.physionet.org/content/clinical-t5/1.0.0/, Last accessed: 2024-06-20</note>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>This work has been partially supported by project MIS 5154714 of the National Recovery and Resilience Plan Greece 2.0 funded by the European Union under the NextGenerationEU Program.</p></div>
			</div>


			<div type="funding">
<div xmlns="http://www.tei-c.org/ns/1.0"><p>(I. Androutsopoulos) https://www.linkedin.com/in/marina-samprovalaki/ (M. Samprovalaki); https://www.linkedin.com/in/anna-chatzipapadopoulou/ (A. Chatzipapadopoulou); https://geomos.sites.aueb.gr/ (G. Moschovis); https://pkaliosis.github.io (P. Kaliosis); https://ipavlopoulos.github.io/ (J. Pavlopoulos); https://www.aueb.gr/users/ion/ (I. Androutsopoulos) 0000-0003-0547-0581 (G. Moschovis); 0000-0001-9188-742 (J. Pavlopoulos)</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Overview of ImageCLEF 2024: Multimedia retrieval in medical applications</title>
		<author>
			<persName><forename type="first">B</forename><surname>Ionescu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Müller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Drăgulinescu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Rückert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ben Abacha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Garcıa Seco De Herrera</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Bloch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Brüngel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Idrissi-Yaghir</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Schäfer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">S</forename><surname>Schmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">M G</forename><surname>Pakull</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Damm</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Bracke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Friedrich</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Andrei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Prokopchuk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Karpenka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Radzhabov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kovalev</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Macaire</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Schwab</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Lecouteux</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Esperança-Rodier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Yim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Sun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Yetisgen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Xia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Hicks</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">A</forename><surname>Riegler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Thambawita</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Storås</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Halvorsen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Heinrich</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kiesel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Potthast</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Stein</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Experimental IR Meets Multilinguality, Multimodality, and Interaction, Proceedings of the 15th International Conference of the CLEF Association (CLEF 2024)</title>
		<title level="s">Springer Lecture Notes in Computer Science LNCS</title>
		<meeting><address><addrLine>Grenoble, France</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">Overview of ImageCLEFmedical 2024 - Caption Prediction and Concept Detection</title>
		<author>
			<persName><forename type="first">J</forename><surname>Rückert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ben Abacha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">G</forename><surname>Seco De Herrera</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Bloch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Brüngel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Idrissi-Yaghir</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Schäfer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Bracke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Damm</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">M G</forename><surname>Pakull</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">S</forename><surname>Schmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Müller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Friedrich</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">CLEF2024 Working Notes, CEUR Workshop Proceedings</title>
				<meeting><address><addrLine>Grenoble, France</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Diagnostic Captioning: A Survey</title>
		<author>
			<persName><forename type="first">J</forename><surname>Pavlopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kougia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Androutsopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Papamichail</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2101.07299</idno>
	</analytic>
	<monogr>
		<title level="j">Knowledge and Information Systems</title>
		<imprint>
			<biblScope unit="volume">64</biblScope>
			<biblScope unit="page" from="1" to="32" />
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Learning to Read Chest X-Rays: Recurrent Neural Cascade Model for Automated Image Annotation</title>
		<author>
			<persName><forename type="first">H.-C</forename><surname>Shin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Roberts</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Demner-Fushman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Yao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">M</forename><surname>Summers</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.1603.08486</idno>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE conference on computer vision and pattern recognition</title>
				<meeting>the IEEE conference on computer vision and pattern recognition</meeting>
		<imprint>
			<date type="published" when="2016">2016</date>
			<biblScope unit="page" from="2497" to="2506" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<monogr>
		<title level="m" type="main">Medical image captioning based on Deep Architectures</title>
		<author>
			<persName><forename type="first">G</forename><surname>Moschovis</surname></persName>
		</author>
		<ptr target="http://urn.kb.se/resolve?urn=urn:nbn:se:kth:diva-323528" />
		<imprint>
			<date type="published" when="2022">2022. 2024-06-20</date>
			<pubPlace>Stockholm, Sweden</pubPlace>
		</imprint>
		<respStmt>
			<orgName>KTH Royal Institute of Technology</orgName>
		</respStmt>
	</monogr>
	<note type="report_type">Master&apos;s thesis</note>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">A Survey on Biomedical Image Captioning</title>
		<author>
			<persName><forename type="first">J</forename><surname>Pavlopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kougia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Androutsopoulos</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/W19-1803</idno>
		<ptr target="https://aclanthology.org/W19-1803" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Second Workshop on Shortcomings in Vision and Language, Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">R</forename><surname>Bernardi</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">R</forename><surname>Fernandez</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">S</forename><surname>Gella</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">K</forename><surname>Kafle</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">C</forename><surname>Kanan</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">S</forename><surname>Lee</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">M</forename><surname>Nabi</surname></persName>
		</editor>
		<meeting>the Second Workshop on Shortcomings in Vision and Language, Association for Computational Linguistics<address><addrLine>Minneapolis, Minnesota</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2019">2019. 2024-06-20</date>
			<biblScope unit="page" from="26" to="36" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">A data-driven guided decoding mechanism for diagnostic captioning</title>
		<author>
			<persName><forename type="first">P</forename><surname>Kaliosis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Pavlopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Charalampakos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Moschovis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Androutsopoulos</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Findings of the Association for Computational Linguistics: ACL 2024</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<monogr>
		<author>
			<persName><forename type="first">W</forename><forename type="middle">X</forename><surname>Zhao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Zhou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Tang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Hou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Min</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Dong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Du</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Jiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Ren</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Tang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J.-Y</forename><surname>Nie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J.-R</forename><surname>Wen</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2303.18223</idno>
		<idno type="arXiv">arXiv:2303.18223</idno>
		<title level="m">A Survey of Large Language Models</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning</title>
		<author>
			<persName><forename type="first">W</forename><surname>Dai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">M H</forename><surname>Tiong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Zhao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">N</forename><surname>Fung</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Hoi</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2305.06500</idno>
	</analytic>
	<monogr>
		<title level="j">Advances in Neural Information Processing Systems</title>
		<imprint>
			<biblScope unit="volume">36</biblScope>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">Radiology Objects in COntext (ROCO): A Multimodal Image Dataset</title>
		<author>
			<persName><forename type="first">O</forename><surname>Pelka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Koitka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Rückert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Nensa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Friedrich</surname></persName>
		</author>
		<idno type="DOI">10.1007/978-3-030-01364-6_20</idno>
	</analytic>
	<monogr>
		<title level="m">7th Joint International Workshop, CVII-STENT 2018 and Third International Workshop</title>
				<meeting><address><addrLine>LABELS; Granada, Spain</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2018-09-16">2018. September 16, 2018. 2018</date>
			<biblScope unit="page" from="180" to="189" />
		</imprint>
	</monogr>
	<note>Proceedings</note>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">Scaling Instruction-Finetuned Language Models</title>
		<author>
			<persName><forename type="first">H</forename><forename type="middle">W</forename><surname>Chung</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Hou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Longpre</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Zoph</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Tay</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Fedus</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Dehghani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Brahma</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2210.11416</idno>
	</analytic>
	<monogr>
		<title level="j">Journal of Machine Learning Research</title>
		<imprint>
			<biblScope unit="volume">25</biblScope>
			<biblScope unit="page" from="1" to="53" />
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Knowledge-Driven Encode, Retrieve, Paraphrase for Medical Image Report Generation</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Liang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Xing</surname></persName>
		</author>
		<idno type="DOI">10.1609/aaai.v33i01.33016666</idno>
	</analytic>
	<monogr>
		<title level="m">AAAI Conference on Artificial Intelligence</title>
				<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">Small Language Models Improve Giants by Rewriting Their Outputs</title>
		<author>
			<persName><forename type="first">G</forename><surname>Vernikos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Brazinskas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Adamek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Mallinson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Severyn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Malmi</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2305.13514</idno>
		<ptr target="https://aclanthology.org/2024.eacl-long.165" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics</title>
		<title level="s">Long Papers</title>
		<editor>
			<persName><forename type="first">Y</forename><surname>Graham</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">M</forename><surname>Purver</surname></persName>
		</editor>
		<meeting>the 18th Conference of the European Chapter of the Association for Computational Linguistics<address><addrLine>St. Julians, Malta</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024-06-20">2024. 2024-06-20</date>
			<biblScope unit="volume">1</biblScope>
			<biblScope unit="page" from="2703" to="2718" />
		</imprint>
	</monogr>
	<note>Association for Computational Linguistics</note>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">ClinicalT5: A Generative Language Model for Clinical Text</title>
		<author>
			<persName><forename type="first">Q</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Dou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Nguyen</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2022.findings-emnlp.398</idno>
	</analytic>
	<monogr>
		<title level="m">Findings of the Association for Computational Linguistics: EMNLP 2022</title>
				<imprint>
			<date type="published" when="2022">2022</date>
			<biblScope unit="page" from="5436" to="5443" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale</title>
		<author>
			<persName><forename type="first">A</forename><surname>Dosovitskiy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Beyer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Kolesnikov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Weissenborn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Zhai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Unterthiner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Dehghani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Minderer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Heigold</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Gelly</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Uszkoreit</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Houlsby</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2010.11929</idno>
		<ptr target="https://openreview.net/forum?id=YicbFdNTTy" />
	</analytic>
	<monogr>
		<title level="m">International Conference on Learning Representations</title>
				<imprint>
			<date type="published" when="2021">2021. 2024-06-20</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">Weakly-Supervised Semantic Segmentation via Transformer Explainability</title>
		<author>
			<persName><forename type="first">I</forename><surname>Athanasiadis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Moschovis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Tuoma</surname></persName>
		</author>
		<idno type="DOI">10.5281/zenodo.6574631</idno>
	</analytic>
	<monogr>
		<title level="m">ML Reproducibility Challenge</title>
				<imprint>
			<date type="published" when="2021">2021. 2022</date>
		</imprint>
	</monogr>
	<note>Fall Edition</note>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">NeuralDynamicsLab at ImageCLEF Medical</title>
		<author>
			<persName><forename type="first">G</forename><surname>Moschovis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Fransén</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">CLEF2022 Working Notes, CEUR Workshop Proceedings</title>
				<meeting><address><addrLine>Bologna, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2022">2022. 2022</date>
		</imprint>
		<respStmt>
			<orgName>CEUR-WS.org</orgName>
		</respStmt>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<analytic>
		<title level="a" type="main">AUEB NLP Group at ImageCLEFmed Caption</title>
		<author>
			<persName><forename type="first">V</forename><surname>Kougia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Pavlopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Androutsopoulos</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Working Notes of CLEF 2019 - Conference and Labs of the Evaluation Forum</title>
				<meeting><address><addrLine>Lugano, Switzerland</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2019-09-09">2019. September 9-12. 2380. 2019</date>
		</imprint>
	</monogr>
	<note>CEUR Workshop Proceedings</note>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">AUEB NLP Group at ImageCLEFmed Caption</title>
		<author>
			<persName><forename type="first">B</forename><surname>Karatzas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Pavlopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kougia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Androutsopoulos</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Working Notes of CLEF 2020 - Conference and Labs of the Evaluation Forum</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting><address><addrLine>Thessaloniki, Greece</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2020-09-22">2020. September 22-25. 2696. 2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<analytic>
		<title level="a" type="main">AUEB NLP Group at ImageCLEFmed Caption Tasks</title>
		<author>
			<persName><forename type="first">F</forename><surname>Charalampakos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Karatzas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kougia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Pavlopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Androutsopoulos</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Working Notes of CLEF 2021 - Conference and Labs of the Evaluation Forum</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting>the Working Notes of CLEF 2021 - Conference and Labs of the Evaluation Forum<address><addrLine>Bucharest, Romania</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2021-09-21">2021. September 21-24. 2936. 2021</date>
			<biblScope unit="page" from="1184" to="1200" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b20">
	<analytic>
		<title level="a" type="main">AUEB NLP Group at ImageCLEFmedical Caption</title>
		<author>
			<persName><forename type="first">F</forename><surname>Charalampakos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Zachariadis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Pavlopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Karatzas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Trakas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Androutsopoulos</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">CLEF2022 Working Notes, CEUR Workshop Proceedings, CEUR-WS.org</title>
				<meeting><address><addrLine>Bologna, Italy</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2022">2022. 2022</date>
			<biblScope unit="page" from="1355" to="1373" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b21">
	<analytic>
		<title level="a" type="main">AUEB NLP Group at ImageCLEFmedical Caption</title>
		<author>
			<persName><forename type="first">P</forename><surname>Kaliosis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Moschovis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Charalampakos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Pavlopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Androutsopoulos</surname></persName>
		</author>
		<ptr target="https://ceur-ws.org" />
	</analytic>
	<monogr>
		<title level="m">CLEF2023 Working Notes, CEUR Workshop Proceedings, CEUR-WS</title>
				<meeting><address><addrLine>Thessaloniki, Greece</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023. 2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<analytic>
		<title level="a" type="main">The Unified Medical Language System (UMLS): integrating biomedical terminology</title>
		<author>
			<persName><forename type="first">O</forename><surname>Bodenreider</surname></persName>
		</author>
		<idno type="DOI">10.1093/nar/gkh061</idno>
	</analytic>
	<monogr>
		<title level="j">Nucleic acids research</title>
		<imprint>
			<biblScope unit="volume">32</biblScope>
			<biblScope unit="page" from="D267" to="D270" />
			<date type="published" when="2004">2004</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b23">
	<monogr>
		<author>
			<persName><forename type="first">J</forename><surname>Rückert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Bloch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Brüngel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Idrissi-Yaghir</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Schäfer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">S</forename><surname>Schmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Koitka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Pelka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">B</forename><surname>Abacha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">G S</forename><surname>De Herrera</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Müller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">A</forename><surname>Horn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Nensa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Friedrich</surname></persName>
		</author>
		<idno type="DOI">10.1038/s41597-024-03496-6</idno>
		<ptr target="https://arxiv.org/abs/2405.10004v1" />
		<title level="m">ROCOv2: Radiology Objects in COntext Version 2, an Updated Multimodal Image Dataset, Scientific Data</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b24">
	<analytic>
		<title level="a" type="main">Fine-Tuning CNN Image Retrieval with No Human Annotation</title>
		<author>
			<persName><forename type="first">F</forename><surname>Radenović</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Tolias</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Chum</surname></persName>
		</author>
		<idno type="DOI">10.1109/TPAMI.2018.2846566</idno>
	</analytic>
	<monogr>
		<title level="j">IEEE Transactions on Pattern Analysis and Machine Intelligence</title>
		<imprint>
			<biblScope unit="volume">41</biblScope>
			<biblScope unit="page" from="1655" to="1668" />
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b25">
	<analytic>
		<title level="a" type="main">Adam: A Method for Stochastic Optimization</title>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">P</forename><surname>Kingma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">L</forename><surname>Ba</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">3rd International Conference on Learning Representations, ICLR 2015</title>
				<meeting><address><addrLine>San Diego, CA, USA</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2015">May 7-9, 2015. 2015</date>
		</imprint>
	</monogr>
	<note>Conference Track Proceedings</note>
</biblStruct>

<biblStruct xml:id="b26">
	<analytic>
		<title level="a" type="main">A Ranking-based KNN Approach for Multi-Label Classification</title>
		<author>
			<persName><forename type="first">T.-H</forename><surname>Chiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H.-Y</forename><surname>Lo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S.-D</forename><surname>Lin</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Asian Conference on Machine Learning</title>
				<meeting>the Asian Conference on Machine Learning<address><addrLine>Singapore</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2012">2012</date>
			<biblScope unit="volume">25</biblScope>
			<biblScope unit="page" from="81" to="96" />
		</imprint>
		<respStmt>
			<orgName>Singapore Management University</orgName>
		</respStmt>
	</monogr>
</biblStruct>

<biblStruct xml:id="b27">
	<monogr>
		<author>
			<persName><forename type="first">A</forename><surname>Eiben</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">E</forename><surname>Smith</surname></persName>
		</author>
		<idno type="DOI">10.1007/978-3-662-44874-8</idno>
		<title level="m">Introduction to Evolutionary Computing</title>
				<imprint>
			<publisher>Springer Publishing Company, Incorporated</publisher>
			<date type="published" when="2015">2015</date>
		</imprint>
	</monogr>
	<note>2nd ed</note>
</biblStruct>

<biblStruct xml:id="b28">
	<analytic>
		<title level="a" type="main">Finetuned Language Models Are Zero-Shot Learners</title>
		<author>
			<persName><forename type="first">J</forename><surname>Wei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Bosma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Zhao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Guu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">W</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Lester</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Du</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">M</forename><surname>Dai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><forename type="middle">V</forename><surname>Le</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2109.01652</idno>
	</analytic>
	<monogr>
		<title level="m">International Conference on Learning Representations</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b29">
	<analytic>
		<title level="a" type="main">BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</title>
		<author>
			<persName><forename type="first">J</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Savarese</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">C H</forename><surname>Hoi</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2301.12597</idno>
		<ptr target="https://api.semanticscholar.org/CorpusID:256390509" />
	</analytic>
	<monogr>
		<title level="m">International Conference on Machine Learning</title>
				<imprint>
			<date type="published" when="2023">2023. 2024-06-20</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b30">
	<monogr>
		<author>
			<persName><forename type="first">Y</forename><surname>Gao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Xiong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Gao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Jia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Pan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Bi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Dai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Sun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Wang</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2312.10997</idno>
		<title level="m">Retrieval-Augmented Generation for Large Language Models: A Survey</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b31">
	<analytic>
		<title level="a" type="main">Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks</title>
		<author>
			<persName><forename type="first">P</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Perez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Piktus</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Petroni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Karpukhin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Goyal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Kuttler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W.-T</forename><surname>Yih</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Rocktäschel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Riedel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Kiela</surname></persName>
		</author>
		<idno>abs/2005.11401</idno>
	</analytic>
	<monogr>
		<title level="j">Neural Information Processing Systems</title>
		<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b32">
	<monogr>
		<title level="m" type="main">A Survey on Retrieval-Augmented Text Generation for Large Language Models</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Huang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Huang</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2404.10981</idno>
		<idno type="arXiv">arXiv:2404.10981</idno>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b33">
	<analytic>
		<title level="a" type="main">Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer</title>
		<author>
			<persName><forename type="first">C</forename><surname>Raffel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">M</forename><surname>Shazeer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Roberts</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Narang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Matena</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">J</forename><surname>Liu</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.1910.10683</idno>
	</analytic>
	<monogr>
		<title level="j">Journal of machine learning research</title>
		<imprint>
			<biblScope unit="volume">21</biblScope>
			<biblScope unit="page">67</biblScope>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b34">
	<monogr>
		<title level="m" type="main">Exploring Uni-modal, Multi-modal and Few-Shot Deep Learning Methods for Diagnostic Captioning</title>
		<author>
			<persName><forename type="first">P</forename><surname>Kaliosis</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
		<respStmt>
			<orgName>Department of Informatics, Athens University of Economics and Business</orgName>
		</respStmt>
	</monogr>
	<note type="report_type">M.Sc. thesis</note>
</biblStruct>

<biblStruct xml:id="b35">
	<analytic>
		<title level="a" type="main">BERTScore: Evaluating text generation with BERT</title>
		<author>
			<persName><forename type="first">T</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kishore</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">Q</forename><surname>Weinberger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Artzi</surname></persName>
		</author>
		<idno>abs/1904.09675</idno>
	</analytic>
	<monogr>
		<title level="m">International Conference on Learning Representations</title>
				<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b36">
	<analytic>
		<title level="a" type="main">ROUGE: A Package for Automatic Evaluation of Summaries</title>
		<author>
			<persName><forename type="first">C.-Y</forename><surname>Lin</surname></persName>
		</author>
		<ptr target="https://aclanthology.org/W04-1013" />
	</analytic>
	<monogr>
		<title level="m">Text Summarization Branches Out, Association for Computational Linguistics</title>
				<meeting><address><addrLine>Barcelona, Spain</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2004">2004. 2024-06-20</date>
			<biblScope unit="page" from="74" to="81" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b37">
	<analytic>
		<title level="a" type="main">BLEU: a Method for Automatic Evaluation of Machine Translation</title>
		<author>
			<persName><forename type="first">K</forename><surname>Papineni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Roukos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Ward</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W.-J</forename><surname>Zhu</surname></persName>
		</author>
		<idno type="DOI">10.3115/1073083.1073135</idno>
		<ptr target="https://aclanthology.org/P02-1040" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">P</forename><surname>Isabelle</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">E</forename><surname>Charniak</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">D</forename><surname>Lin</surname></persName>
		</editor>
		<meeting>the 40th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics<address><addrLine>Philadelphia, Pennsylvania, USA</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2002">2002. 2024-06-20</date>
			<biblScope unit="page" from="311" to="318" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b38">
	<analytic>
		<title level="a" type="main">BLEURT: Learning Robust Metrics for Text Generation</title>
		<author>
			<persName><forename type="first">T</forename><surname>Sellam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Das</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Parikh</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2020.acl-main.704</idno>
		<ptr target="https://aclanthology.org/2020.acl-main.704" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">D</forename><surname>Jurafsky</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><surname>Chai</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">N</forename><surname>Schluter</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><surname>Tetreault</surname></persName>
		</editor>
		<meeting>the 58th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics</meeting>
		<imprint>
			<date type="published" when="2020">2020. 2024-06-20</date>
			<biblScope unit="page" from="7881" to="7892" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b39">
	<analytic>
		<title level="a" type="main">METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments</title>
		<author>
			<persName><forename type="first">S</forename><surname>Banerjee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Lavie</surname></persName>
		</author>
		<idno type="DOI">10.3115/1626355.1626389</idno>
		<ptr target="https://aclanthology.org/W05-0909" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization, Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">J</forename><surname>Goldstein</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">A</forename><surname>Lavie</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">C.-Y</forename><surname>Lin</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">C</forename><surname>Voss</surname></persName>
		</editor>
		<meeting>the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization, Association for Computational Linguistics<address><addrLine>Ann Arbor, Michigan</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2005">2005. 2024-06-20</date>
			<biblScope unit="page" from="65" to="72" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b40">
	<analytic>
		<title level="a" type="main">Training language models to follow instructions with human feedback</title>
		<author>
			<persName><forename type="first">L</forename><surname>Ouyang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Jiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Almeida</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">L</forename><surname>Wainwright</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Mishkin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Agarwal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Slama</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ray</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Schulman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Hilton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Kelton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">E</forename><surname>Miller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Simens</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Askell</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Welinder</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Christiano</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Leike</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">J</forename><surname>Lowe</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2203.02155</idno>
	</analytic>
	<monogr>
		<title level="j">Neural Information Processing Systems</title>
		<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b41">
	<analytic>
		<title level="a" type="main">Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks</title>
		<author>
			<persName><forename type="first">P</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Perez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Piktus</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Petroni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Karpukhin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Goyal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Küttler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W.-T</forename><surname>Yih</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Rocktäschel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Riedel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Kiela</surname></persName>
		</author>
		<idno type="DOI">10.48550/arXiv.2005.11401</idno>
	</analytic>
	<monogr>
		<title level="m">Advances in Neural Information Processing Systems</title>
				<imprint>
			<publisher>Curran Associates, Inc</publisher>
			<date type="published" when="2020">2020</date>
			<biblScope unit="volume">33</biblScope>
			<biblScope unit="page" from="9459" to="9474" />
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
