<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">UIT-DarkCow team at ImageCLEFmedical Caption 2024: Diagnostic Captioning for Radiology Images Efficiency with Transformer Models</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Quan</forename><surname>Van Nguyen</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution">University of Information Technology</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Huy</forename><forename type="middle">Quang</forename><surname>Pham</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution">University of Information Technology</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Dan</forename><forename type="middle">Quang</forename><surname>Tran</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution">University of Information Technology</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Thang</forename><forename type="middle">Kien-Bao</forename><surname>Nguyen</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution">University of Information Technology</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Nhat-Hao</forename><surname>Nguyen-Dang</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution">University of Information Technology</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author role="corresp">
							<persName><forename type="first">Thien</forename><forename type="middle">B</forename><surname>Nguyen-Tat</surname></persName>
							<email>thienntb@uit.edu.vn</email>
							<affiliation key="aff0">
								<orgName type="institution">University of Information Technology</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">UIT-DarkCow team at ImageCLEFmedical Caption 2024: Diagnostic Captioning for Radiology Images Efficiency with Transformer Models</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">066F92F4448135701A282554357A159B</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T17:55+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>ImageCLEF, Computer Vision, Diagnostic Captioning, Image Captioning, Image Understanding, Radiology Images, Transformer Models, Encoder-Decoder, Query Transformer</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>Purpose: This study focuses on the development of automated text generation from radiology images, termed diagnostic captioning, to assist medical professionals in reducing clinical errors and improving productivity. The aim is to provide tools that enhance report quality and efficiency, which can significantly impact both clinical practice and deep learning research in the biomedical field. Methods: In our participation in the ImageCLEFmedical2024 Caption evaluation campaign, we explored caption prediction tasks using advanced Transformer-based models. We developed methods incorporating Transformer encoder-decoder and Query Transformer architectures. These models were trained and evaluated to generate diagnostic captions from radiology images. Results: Experimental evaluations demonstrated the effectiveness of our models, with the VisionDiagnostor-BioBART model achieving the highest BERTScore of 0.6267. This performance contributed to our team, DarkCow, achieving third place on the leaderboard. Our source code is public at this link. Conclusion: Our diagnostic captioning models show great promise in aiding medical professionals by generating high-quality reports efficiently. This approach can facilitate better data processing and performance optimization in medical imaging departments, ultimately benefiting healthcare delivery.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>Machine learning, especially Deep Learning, is creating breakthroughs in many different fields, and its impact on biomedicine is remarkable. With the exponential growth of biomedical data, researchers are exploring its potential in biomedical engineering, advanced computing, imaging systems, and biomedical data mining algorithms based on machine learning <ref type="bibr" target="#b0">[1]</ref>. One important area is Diagnostic Captioning. Diagnostic Captioning is the process of automatically generating diagnostic text based on a set of medical images collected during a medical examination. It can assist less experienced physicians by minimizing clinical errors and helping experienced physicians generate diagnostic reports faster <ref type="bibr" target="#b1">[2]</ref>.</p><p>ImageCLEF is an annual multimodal machine learning campaign, part of the Cross-Language Evaluation Forum (CLEF), which has been running since 2003. It encourages breakthroughs in research and development of processing systems. Advanced multimedia processing in computer vision, image analysis, classification and retrieval in a multilingual, multimodal context. This year, one of ImageCLEF's four main missions is ImageCLEFMedical, which includes a series of challenges from annotating images to creating synthetic images and answering questions. In ImageCLEF 2024 <ref type="bibr" target="#b2">[3]</ref>, we took part in the ImageCLEFmedical Caption task <ref type="bibr" target="#b3">[4]</ref>. As in previous years, this task comprised two subtasks: concept detection and caption prediction.</p><p>Concept detection aims to associate biomedical images with related medical concepts while captioning prediction focuses on automatically generating preliminary diagnostic reports that accurately describe medical conditions and structures and anatomy shown in images. Concept detection also supports diagnostic notes by identifying key concepts that should be included in the preliminary report. Additionally, it can be used to index medical images according to related concepts, facilitating more efficient organization and retrieval.</p><p>Captioning prediction, in other words, diagnostic captioning, remains a challenging research problem, designed to support the diagnostic process by providing a preliminary report rather than replacing the physicians and human factors involved <ref type="bibr" target="#b1">[2]</ref>. It is designed as a tool to assist in generating an initial diagnostic report of a patient's condition, helping doctors focus on important areas of the image <ref type="bibr" target="#b4">[5]</ref> and assisting them in making diagnoses. Guess more accurately quickly <ref type="bibr" target="#b5">[6]</ref>. This approach can increase the efficiency of experienced clinicians, allowing them to handle high volumes of daily medical examinations more quickly and efficiently. For less experienced clinicians, automated annotation can help reduce the likelihood of clinical errors <ref type="bibr" target="#b6">[7]</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.1.">DarkCow Team Contributions</head><p>In this paper, we presented the experiments and the systems that were submitted by our DarkCow team in this year's caption prediction task, which helped us secure third place on the leaderboard (see Table <ref type="table" target="#tab_0">1</ref>). Our new approaches build on the rapid development of deep learning techniques, especially the Transformer <ref type="bibr" target="#b7">[8]</ref> encoder-decoder architecture and the Query Transformer <ref type="bibr" target="#b8">[9]</ref> for Large Language Model <ref type="bibr" target="#b9">[10]</ref>. We leveraged the Vision Transformer (ViT) to extract visual features from radiology images. To optimize the use of information, we also used VinVL <ref type="bibr" target="#b10">[11]</ref> to extract features of objects in the images. Our first approach is based on encoder-decoder architecture to generate image captions. In the second approach, we leveraged Query Transformer to help LLM understand images. We also conducted experiments with image pre-processing, caption length, and object features to analyze the impact of those aspects. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Background and Related Works</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Radiology Techniques</head><p>With the continuous advancement of imaging technology, medical imaging diagnosis has evolved from a supplementary examination tool to the most important clinical diagnostic and differential diagnostic method in modern medicine. Radiology techniques are used to scan images within the body, which are then interpreted and reported by radiologists to specialists <ref type="bibr" target="#b11">[12]</ref>. With advancements in imaging technology, various imaging diagnostic methods have been developed, each with its own advantages and limitations. For example, X-ray imaging <ref type="bibr" target="#b12">[13]</ref> offers non-invasive, quick, and painless imaging, but it involves exposure to ionizing radiation, which increases the risk of developing cancer later in life. On the other hand, MRI imaging <ref type="bibr" target="#b13">[14]</ref> provides non-ionizing radiation and high spatial resolution, but it has relatively low sensitivity and longer scanning times, etc.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Former Medical Image Captioning Datasets</head><p>Medical imaging diagnosis today plays an incredibly important role in both the healthcare and information technology sectors. It not only aids in diagnosis and increases understanding of diseases but also holds immense potential in improving healthcare delivery and enhancing quality of life. The application of deep learning in medical image captioning in an era where AI is ubiquitous is evident; it automates the annotation process and significantly accelerates image analysis. Several datasets have been created to facilitate the training of medical image captioning tasks such as ROCO <ref type="bibr" target="#b14">[15]</ref>, PadChest <ref type="bibr" target="#b15">[16]</ref>, MIMIC-CXR <ref type="bibr" target="#b16">[17]</ref>, IU X-Ray <ref type="bibr" target="#b17">[18]</ref>, and MedICaT <ref type="bibr" target="#b18">[19]</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.">Related Work Methods</head><p>For the task of medical image captioning, various methods have been developed, with pioneering work in applying the CNN-RNN encoder-decoder approach to generate captions from medical images conducted by Shin et al. <ref type="bibr" target="#b4">[5]</ref>. They utilized either the Network-in-Network or GoogLeNet architectures as encoding models, followed by LSTM <ref type="bibr" target="#b19">[20]</ref> or GRU <ref type="bibr" target="#b20">[21]</ref> as the decoding RNN to translate the encoded images into descriptive captions. In the process of translating images into biomedical text, MDNET <ref type="bibr" target="#b21">[22]</ref> made a notable advancement by incorporating an attention mechanism. This model employs RESNET for image encoding, extending its skip connections to mitigate gradient vanishing.</p><p>In recent studies by Wang et al. <ref type="bibr" target="#b22">[23]</ref>, Kougia et al. <ref type="bibr" target="#b23">[24]</ref>, and Li et al. <ref type="bibr" target="#b24">[25]</ref>, a fusion of generative models and retrieval systems for Medical Image Captioning (MIC) has been explored. For instance, Wang et al. <ref type="bibr" target="#b22">[23]</ref> proposed an approach that alternates between template retrieval and sentence generation for rare abnormal descriptions. This method relies on a contextual relational-topic encoder derived from visual and textual features, facilitating semantic consistency through hybrid knowledge co-reasoning. Additionally, Kougia et al. <ref type="bibr" target="#b23">[24]</ref> from AUEB NLP group presented various systems for the Image-CLEFmed 2019 Caption task. One approach utilized a retrieval-based model that leverages visual features to retrieve the most similar images based on cosine similarity, combining their concepts to predict relevant captions. Another system incorporated CheXNet <ref type="bibr" target="#b25">[26]</ref> with enhanced classification labels, employing a CNN encoder and a feed-forward neural network (FFNN) for multi-label classification. They also suggested an ensemble model by combining these systems, computing scores for returned concepts and merging them with image similarity scores to select the most relevant concepts.</p><p>Large language models (LLMs) have catalyzed significant progress in medical question answering; Med-PaLM <ref type="bibr" target="#b26">[27]</ref> was the first model to exceed a "passing" score in US Medical Licensing Examination (USMLE). However, this and other prior work suggested significant room for improvement, especially when models' answers were compared to clinicians' answers. Med-PaLM 2 <ref type="bibr" target="#b27">[28]</ref> bridges these gaps by leveraging a combination of base LLM improvements, medical domain finetuning, and prompting strategies including a novel ensemble refinement approach.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Dataset</head><p>Thanks to AUEB NLP Group for providing an excellent analysis of the dataset in the study of Kaliosis et al. <ref type="bibr" target="#b28">[29]</ref>. When comparing ImageCLEFmedical2023 data with ImageCLEFmedical2024, we found no significant differences in the task of caption prediction. Therefore, we decided to reapply to analyze the dataset in this section. This year's ImageCLEFmedical Caption task provided a dataset that includes 70,108 radiology images in the training set, each annotated with medical concepts using UMLS terms and diagnostic captions. The organizers initially divided the dataset into training and validation subsets <ref type="bibr" target="#b29">[30]</ref>. Building on previous campaigns, this year's dataset is an updated and expanded version of the Radiology Objects in Context (ROCO) dataset, which is sourced from a variety of biomedical studies in the PubMed Central OpenAccess (PMC OA) subset. The dataset used for the caption prediction task includes images from different modalities, such as X-ray and Computed Tomography (CT), although specific details about the image types were not provided. The goal of the caption prediction task is to generate open-ended diagnostic texts for the medical images (see Figure <ref type="figure" target="#fig_0">1</ref>).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 2</head><p>The ten most common words and their frequencies in the ImageCLEFmedical2024 train set.  Preoperative CT scan. 9</p><p>In the Caption prediction sub-task, each image has a diagnostic caption describing the described medical condition. There are a total of 69,743 captions in the training dataset and 9,959 captions in the validation dataset, one for each image. Similar to last year's campaign, the majority of captions (99.47%, or 69,743 out of 70,108) were unique. This is a notable difference from previous versions of the quest, where the uniqueness percentage was much lower. As a result, traditional retrieval methods based on nearest neighbor search are less efficient this year, including variants with a weighting mechanism based on the cosine similarity of the retrieved images. Therefore, more complex methods of creating subtitles are needed.  We observed that the maximum number of words in a single caption is 848 (occurred once), while the minimum is 1 (encountered 1 time). The average caption length is 20.84 words. These statistics apply to the entire dataset ( training set and valid set). The five most common captions, as well as the ten most popular words (excluding stopwords), can be found in Tables <ref type="table" target="#tab_2">3 and 2</ref>, respectively. In Figure <ref type="figure" target="#fig_1">2</ref> and Figure <ref type="figure" target="#fig_2">3</ref>, we present a distribution caption length of the training and valid sets, both indicating that the majority of captions contain fewer than 100 words. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Image Pre-processing 4.1. Denoising</head><p>Denoising is crucial in enhancing image quality by reducing the noise while preserving the important details. Noise in medical images can come from various sources, such as sensor imperfections, poor scan conditions, or inherent patient movements during image acquisition.</p><p>The smoothness of images is controlled through the utilization of a Gaussian filter with a fixed kernel size. The Gaussian filter operates by smoothing images using a technique called convolution. It employs a Gaussian kernel -a matrix based on the Gaussian function to adjust pixel values. This kernel is applied over each pixel in the image, averaging the pixel values in its vicinity, weighted by their distance from the central pixel. The standard deviation 𝜎 of the Gaussian determines the amount of blurring: a larger 𝜎 results in more blurring, smoothing out more details and noise. This process helps in reducing noise and is often used as a preparatory step in image processing tasks to enhance image quality without losing critical structural details (see Figure <ref type="figure" target="#fig_3">4</ref>).</p><p>The 2-D Gaussian function is given by:</p><formula xml:id="formula_0">𝐺(𝑥, 𝑦) = 1 2𝜋𝜎 2 𝑒 − 𝑥 2 +𝑦 2 2𝜎 2 (1)</formula><p>Medical image enhancement is one of the most widely used medical image processing techniques in medical domain. Its purpose is to improve the visual effect of the image and facilitate the analysis and understanding of the image by humans or machines. The Laplace transform and the Sobel gradient operator are two common ways of performing edge detection, image sharpening, and enabling the enhancement of the image (see Figure <ref type="figure">5</ref>).</p><p>Step 1 Laplace Transform: Apply the Laplace transform to enhance contrast by emphasizing areas of rapid intensity change in the original image.</p><p>Step 2 Sobel Operator: Use the Sobel operator to enhance the edges of the image. This step also helps to smooth out noise, making the edges clearer and more cohesive.</p><p>Step 3 Smoothing: Smooth the image processed by the Sobel operator using a 3x3 mean filter. This step increases the contrast of the edges against the background.</p><p>Step 4 Dot Product: Intensify the contrast by performing a dot product of the smoothed image with the result from the Laplace transform from step 1.</p><p>Step 5 Addition for Final Sharpening: Enhance the sharpness and visibility of detail by adding the result of the dot product back to the original image.</p><p>Step 6 Histogram Equalization: Apply histogram equalization to distribute the histogram of the image uniformly, improving the overall contrast and making fine details more visible.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.">Image Enhancement</head><p>Figure <ref type="figure">5</ref>: The image after a series of processing.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Proposed Method</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.1.">Encoder-Decoder Approach</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.1.1.">Features Embedding</head><p>We propose VisionDiagnostor centers around the implementation of Transformer encoder-decoder approach and deployed to evaluate methods having ClinicalT5 <ref type="bibr" target="#b30">[31]</ref> and BioBART <ref type="bibr" target="#b31">[32]</ref> as encoderdecoder module (see Figure <ref type="figure" target="#fig_5">6</ref>). ClinicalT5, based on the T5 <ref type="bibr" target="#b32">[33]</ref> architecture, and BioBART, a variant of the BART <ref type="bibr" target="#b33">[34]</ref> architecture, have both been pre-trained on large of biomedical text data. These models stand out as the preeminent and potent pre-trained language models for the medical domain, ensuring the efficacy and robustness of our proposed method.</p><p>Object features: To extract object features in an image, we used the VinVL model to extract object features 𝑅 = {𝑟 1 , 𝑟 2 , ..., 𝑟 𝑘 } from an image, with each 𝑟 𝑖 being a 2048-dimensional vector. Bounding box coordinates are normalized as</p><formula xml:id="formula_1">𝑏 𝑖 = [︁ 𝑥 𝑚𝑖𝑛 𝑖 𝑤 , 𝑦 𝑚𝑖𝑛 𝑖 ℎ , 𝑥 𝑚𝑎𝑥 𝑖 𝑤 , 𝑦 𝑚𝑎𝑥 𝑖 ℎ</formula><p>]︁ , forming 𝐵 obj = {𝑏 1 , 𝑏 2 , ..., 𝑏 𝑘 }. Final object features 𝑉 obj are computed by projecting 𝑅 and 𝐵 obj to the language model dimension and summing the results:</p><formula xml:id="formula_2">𝑉 obj = 𝑅 ′ + 𝐵 ′ obj (2)</formula><p>We use ViT for visual feature extraction due to its ability to capture global information through its attention mechanism. By freezing ViT and projecting the last hidden state to match the language model's dimension, we obtain visual features 𝑉 .  The input embedding to the encoder-decoder module is:</p><formula xml:id="formula_3">Input = Concat(𝑉, 𝑉 obj )<label>(3)</label></formula><p>Where 𝑉 are the visual features from ViT, and 𝑉 obj are the VinVL region object features. The Concat(•) function concatenates these features.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.1.2.">Encoder-Decoder Module</head><p>In this task, we employed the Transformer encoder-decoder architecture, which is used in ClinicalT5 <ref type="bibr" target="#b30">[31]</ref> and BioBART <ref type="bibr" target="#b31">[32]</ref> for the encoder-decoder module of VisionDiagnostor. The encoder receives the input features and then passes them to the decoder to generate the output sentence. In the decoder, attention mechanisms are employed, directing focus to both the output of the encoder and the input of the decoder.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Encoder</head><p>Multi-Head Attention:</p><formula xml:id="formula_4">Attention (Enc) (𝑄, 𝐾, 𝑉 ) = softmax (︂ 𝑄𝐾 𝑇 √ 𝑑 𝑘 )︂ 𝑉<label>(4)</label></formula><p>where 𝑄, 𝐾, and 𝑉 are the query, key, and value matrices, and 𝑑 𝑘 is the dimensionality of the key vectors.</p><p>Encoder Feed-Forward Network:</p><formula xml:id="formula_5">FFN (Enc) (𝑥) = ReLU(𝑥𝑊 (Enc) 1 + 𝑏 (Enc) 1</formula><p>)𝑊</p><formula xml:id="formula_6">(Enc) 2 + 𝑏 (Enc) 2<label>(5)</label></formula><p>where</p><formula xml:id="formula_7">𝑊 (Enc) 1 , 𝑊<label>(Enc) 2</label></formula><p>, 𝑏</p><p>, and 𝑏 (Enc) 2 are learnable parameters.</p><p>Encoder Layer Normalization:</p><formula xml:id="formula_9">LayerNorm (Enc) (𝑥) = LN (Enc) (𝑥 + LayerNorm (Enc) (𝑥))<label>(6)</label></formula><p>where LN (Enc) is the layer normalization function.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Decoder</head><p>Decoder Self-Attention:</p><formula xml:id="formula_10">Attention (Dec) (𝑄, 𝐾, 𝑉 ) = softmax (︂ 𝑄𝐾 𝑇 √ 𝑑 𝑘 )︂ 𝑉<label>(7)</label></formula><p>where 𝑄, 𝐾, and 𝑉 are the query, key, and value matrices, and 𝑑 𝑘 is the dimensionality of the key vectors.</p><p>Decoder-Encoder Cross-Attention:</p><formula xml:id="formula_11">Attention (Dec) (𝑄, 𝐾, 𝑉 ) = softmax (︂ 𝑄𝐾 𝑇 √ 𝑑 𝑘 )︂ 𝑉 (<label>8</label></formula><formula xml:id="formula_12">)</formula><p>where 𝑄 comes from the decoder and 𝐾, 𝑉 come from the encoder.</p><p>Decoder Feed-Forward Network:</p><formula xml:id="formula_13">FFN (Dec) (𝑥) = ReLU(𝑥𝑊 (Dec) 1 + 𝑏 (Dec) 1</formula><p>)𝑊</p><formula xml:id="formula_14">(Dec) 2 + 𝑏 (Dec) 2<label>(9)</label></formula><p>where</p><formula xml:id="formula_15">𝑊 (Dec) 1 , 𝑊 (Dec) 2 , 𝑏<label>(Dec) 1</label></formula><p>, and 𝑏 (Dec) 2</p><p>are learnable parameters.</p><p>Decoder Layer Normalization:</p><formula xml:id="formula_16">LayerNorm (Dec) (𝑥) = LN (Dec) (𝑥 + LayerNorm (Dec) (𝑥))<label>(10)</label></formula><p>where LN (Dec) is the layer normalization function.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.2.">Query Transformer Approach</head><p>Inspired by the BLIP2 architecture <ref type="bibr" target="#b34">[35]</ref>, we leveraged the Query Transformer (Q-Former) module, which serves as the trainable intermediary between a fixed image encoder and a fixed Large Language Model. It extracts a consistent number of output features from the image encoder, irrespective of the input image resolution. Q-Former comprises two transformer submodules that share self-attention layers: an image transformer for visual feature extraction from the fixed image encoder and a text transformer acting as both an encoder and decoder. We initialize a set number of learnable query embeddings as input to the image transformer. These queries engage in self-attention and cross-attention interactions with each other and the frozen image features. Additionally, they can interact with the text through self-attention layers, with different attention masks applied based on the pre-training task.</p><p>In our experiments, we employ 64 queries, each with a dimensionality of 768, matching the hidden dimension of Q-Former. We utilize VIT-huge <ref type="bibr" target="#b35">[36]</ref> as the frozen image encoder and BioMistral-7B <ref type="bibr" target="#b36">[37]</ref> as the frozen LLM for caption generation, and we call it VisionDiagnostor-Q-BioMistral which is depicted in Figure <ref type="figure" target="#fig_7">7</ref>. This bottleneck architecture, combined with our pre-training objectives, compels the queries to extract visual information most pertinent to the accompanying text.  </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.3.">Evaluation Metrics</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.3.1.">BERTScore</head><p>BERTScore is computed as proposed by Zhang et al. <ref type="bibr" target="#b37">[38]</ref>, where the cosine similarity of each hypothesis token 𝑗 with each token 𝑖 in the reference sentence is calculated using contextualized embeddings. Instead of using a time-consuming best-case matching approach, a greedy matching strategy is employed. The F1 measure is then calculated as follows:</p><formula xml:id="formula_17">𝑅 BERT = 1 |r| ∑︁ 𝑖∈r max 𝑗∈p cos(𝑖 ⃗ , 𝑗 ⃗ ),<label>(11)</label></formula><formula xml:id="formula_18">𝑃 BERT = 1 |p| ∑︁ 𝑗∈p max 𝑖∈r cos(𝑖 ⃗ , 𝑗 ⃗ ),<label>(12)</label></formula><formula xml:id="formula_19">BERTScore = 𝐹 BERT = 2 • 𝑃 BERT • 𝑅 BERT 𝑃 BERT + 𝑅 BERT .<label>(13)</label></formula><p>The BERTScore correlates better with human judgments for the tasks of image captioning and machine translation.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.3.2.">Other Metrics</head><p>In addition to BERTScore, the competition also uses many other metrics such as ROUGE <ref type="bibr" target="#b38">[39]</ref>, BLEU-1 <ref type="bibr" target="#b39">[40]</ref>, BLEURT <ref type="bibr" target="#b40">[41]</ref>, METEOR <ref type="bibr" target="#b41">[42]</ref>, CIDEr <ref type="bibr" target="#b42">[43]</ref>, CLIPScore <ref type="bibr" target="#b43">[44]</ref>, RefCLIPScore <ref type="bibr" target="#b43">[44]</ref>, ClinicBLEURT <ref type="bibr" target="#b44">[45]</ref> and MedBERTScore <ref type="bibr" target="#b45">[46]</ref>. Applying a variety of these metrics helps us have a more accurate and general view of the model performance of participating teams. Each measure has its own advantages and provides a different perspective on text quality that makes it relevant in a medical context. This multi-dimensional evaluation helps identify outstanding models and gain an objective view of the competition.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">Experiment Results</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.1.">Experimental Configuration</head><p>All our proposed methods were trained and fine-tuned using the Adam optimization <ref type="bibr" target="#b46">[47]</ref>. We utilized an A100-GPU setup with 80GB of memory to train models, taking 10 hours on average for each method.</p><p>We set the learning rate to 3e-05, dropout is set at 0.2, batch size is 32, and the training process is terminated after 3 epochs of not finding any reduction in the valid loss. Table <ref type="table" target="#tab_3">4</ref> presents a comprehensive of the results on the test set achieved by individual models, showcasing their BERTScore and other metrics. The findings underscore significant disparities in performance among the various models, providing valuable insights into their respective strengths and weaknesses. Notably, within the baseline models, VisionDiagnostor-BioBART stands out as the top performer, showcasing an impressive BERTScore of 0.6267 and almost all other metrics with the smallest size at 227M parameters. Moreover, Table <ref type="table" target="#tab_3">4</ref> demonstrates that using large-scale pre-trained models in VisionDiagnostor-Q-BioMistral with a very large size (8B) does not result in significant performance improvement in this task.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.2.">Main Result</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="7.">Result Analysis</head><p>In this section, we conduct a subjective analysis of the valid set due to the limited number of submissions in the competition. This means that instead of using the test set for objective evaluation, we used the valid set to analyze the results our proposed methods achieved.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="7.1.">Impact of Image Pre-processing Table 5</head><p>Results of models with image pre-processing in valid set. △ indicates the increase (↑) or decrease (↓) and compares without pre-processing (*).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Model BERTScore</head><p>VisionDiagnostor-Q-BioMistral △ 0.6841* ↓ 0.0101 VisionDiagnostor-ClinicalT5 △ 0.7071* ↓ 0.0166 VisionDiagnostor-BioBART △ 0.7165* ↑ 0.0198 Table <ref type="table">5</ref> presents the results comparing the performance of the models with and without image preprocessing on the validation dataset, evaluated using BERTScore. Specifically, for the VisionDiagnostor-Q-BioMistral model, BERTScore decreased from 0.6841 to 0.6740 after applying pre-processing, corresponding to a decrease of 0.0101. Similarly, VisionDiagnostor-ClinicalT5 also saw a decrease in performance from 0.7071 to 0.6905, a decrease of 0.0166. In contrast, VisionDiagnostor-BioBART is the only model with an improvement with BERTScore increasing from 0.7165 to 0.7363, an increase of 0.0198.</p><p>Overall, applying image pre-processing does not appear to yield significant improvement for most models. Even for two of the three models (VisionDiagnostor-Q-BioMistral and VisionDiagnostor-ClinicalT5), image pre-processing degrades performance. The reason may be because of the input images are of good quality and have almost no noise. Some images also have clear instructions, such as arrows pointing to the relevant caption of the image (see Figure <ref type="figure" target="#fig_0">1</ref> in Section 3), making it easy for the model to understand and process the content without additional pre-processing. The details of the test set based on different groups of lengths are in Table <ref type="table" target="#tab_4">6</ref>. Classification is done as follows:</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="7.2.">Impact of Caption Length</head><p>• Short caption: These are captions shorter than 21 words.  Specifically, the two models VisionDiagnostor-ClinicalT5 and VisionDiagnostor-BioBART based on the encoder-decoder method have similar trends, both showing a gradual decrease in BERTScore as the caption length increases. This may indicate a limitation in handling longer captions with this method.</p><p>It is worth noting that the VisionDiagnostor-Q-BioMistral model represents a different case, with performance increasing as the caption length increases. This may imply that this model is capable of handling longer captions more efficiently than other models, possibly due to its complexity and magnitude.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="7.3.">Impact of Object Features</head><p>According to papers from competing teams in previous years <ref type="bibr" target="#b47">[48]</ref>, <ref type="bibr" target="#b48">[49]</ref>, <ref type="bibr" target="#b28">[29]</ref>, the most popular image feature extraction methods today have two main directions: convolutional neural networks (CNN) and Vision transformers (ViT). Studies and demonstrations have shown that ViT often gives better results than CNN in the task of image captioning. ViT is capable of capturing long-term and global relationships in images more effectively, leading to the creation of richer and more accurate captions. However, to improve the quality of feature extraction further, we used the VinVL model. VinVL takes advantage of the power of the ability to detect and represent objects in images in detail. This allows the model to gain a deeper understanding of the context and elements in the image, thereby creating more accurate captions.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 7</head><p>Results of models with image pre-processing in valid set. △ indicates the increase (↑) or decrease (↓) and compares with models using object features (*).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Model BERTScore</head><p>VisionDiagnostor-ClinicalT5 △ 0.7071* ↓ 0.0239 VisionDiagnostor-BioBART △ 0.7165* ↓ 0.0321 Table <ref type="table">7</ref> presents the results of the models when not using object features in the valid set. The figures show that not using object features significantly reduced the performance of the models.</p><p>Specifically, the VisionDiagnostor-ClinicalT5 model has a BERTScore of 0.7071 when using object features. However, when not using object features, the performance of this model drops by 0.0239. Similarly, the VisionDiagnostor-BioBART model also shows a significant decrease when not using object features, with BERTScore decreasing from 0.7165 to 0.6844, corresponding to a decrease of 0.0321.</p><p>These results indicate that using object features has an important effect in improving model performance. Object features can provide detailed and characteristic information about objects in images, helping models understand and describe images more accurately. Removing object features results in the loss of important information, reducing the model's ability to produce accurate and detailed captions, which in turn reduces BERTScore significantly.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="8.">Conclusion and Future Works</head><p>In this paper, we have proposed three different models to solve the task of medical image captioning, in other words medical image diagnosis, including VisionDiagnostor-ClinicalT5 and VisionDiagnostor-BioBART based on encoder-decoder architecture, VisionDiagnostor-Q-BioMistral based on BLIP2 architecture with Query Transformer which leveraging the power of Large Language Models (LLM).</p><p>Our results show that the VisionDiagnostor-BioBART model achieved third place on the leaderboard, with the highest BERTScore of 0.6267, despite being the smallest in size with only 227M parameters. Additionally, we performed analysis of the results to gain a deeper understanding of the factors that influence the performance of the models, including image pre-processing, caption length, and object features. These analyses have provided the comprehensive insight needed to shape and improve future methods and models for this task.</p><p>In future works, our objective is to delve deeper into the applications of other biomedical large language models (LLMs) BioMedLM <ref type="bibr" target="#b49">[50]</ref>, BioGPT <ref type="bibr" target="#b50">[51]</ref>, especially focusing on enhancing their capabilities to generate precise captions that are context-sensitive. This development will be pursued through methods like instruction tuning and better alignment of the models with specific user requirements. In addition, we plan to explore the integration of dense retrieval techniques into the biomedical image captioning process <ref type="bibr" target="#b51">[52]</ref>. By adopting frameworks akin to Retrieval Augmented Generation, we intend to supplement the LLMs with an external, non-parametric memory using a FAISS index <ref type="bibr" target="#b52">[53]</ref>, thereby enriching their reasoning capabilities. Another area of interest will be investigating the interconnections between these approaches. We also anticipate evaluating the qualitative variations in the captions generated through these different methodologies to ascertain their efficacy and practicality in real-world applications.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: Several images from the ImageCLEFmedical2024 dataset.</figDesc><graphic coords="4,72.00,65.60,451.28,248.60" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Figure 2 :</head><label>2</label><figDesc>Figure 2: Distribution of caption lengths in the training set.</figDesc><graphic coords="5,117.13,142.84,361.02,186.92" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Figure 3 :</head><label>3</label><figDesc>Figure 3: Distribution of caption lengths in the valid set. [30]</figDesc><graphic coords="5,117.13,381.14,361.03,188.60" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 4 :</head><label>4</label><figDesc>Figure 4: Application of Gaussian filter.</figDesc><graphic coords="6,72.00,65.61,451.28,215.41" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_4"><head></head><label></label><figDesc>Peres et al. (2017)]</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_5"><head>Figure 6 :</head><label>6</label><figDesc>Figure 6: Overview of VisionDiagnostor.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_6"><head></head><label></label><figDesc>Computed tomography of the head on Day 22 shows dilated left lateral ventricle with parenchymal hemorrhage in the right frontal lobe (black arrows) and intraventricular hemorrhage (white arrow) despite ventriculostomy tubes. Kim et al. (2021)]</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_7"><head>Figure 7 :</head><label>7</label><figDesc>Figure 7: Overview of VisionDiagnostor-Q.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_8"><head>Figure 8 :</head><label>8</label><figDesc>Figure 8: The results of models based on caption length.</figDesc><graphic coords="12,139.69,359.82,315.89,173.00" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0"><head></head><label></label><figDesc></figDesc><graphic coords="7,72.00,167.99,451.27,233.36" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1</head><label>1</label><figDesc>Caption prediction task scores, rankings are based on BERTScore</figDesc><table><row><cell>Team</cell><cell cols="3">ID BERTScore ROUGE</cell><cell cols="3">BLEU-1 BLEURT METEOR</cell><cell>CIDEr</cell><cell cols="4">CLIPScore RefCLIPScore ClinicalBLEURT MedBERTScore</cell></row><row><cell>pclmed</cell><cell>634</cell><cell>0.629913</cell><cell>0.272626</cell><cell>0.268994</cell><cell>0.337626</cell><cell>0.113264</cell><cell>0.268133</cell><cell>0.823614</cell><cell>0.817610</cell><cell>0.466557</cell><cell>0.632318</cell></row><row><cell>CS_Morgan</cell><cell>429</cell><cell>0.628059</cell><cell>0.250801</cell><cell>0.209298</cell><cell>0.317385</cell><cell>0.092682</cell><cell>0.245029</cell><cell>0.821262</cell><cell>0.815534</cell><cell>0.455942</cell><cell>0.632664</cell></row><row><cell>DarkCow</cell><cell>220</cell><cell>0.626720</cell><cell cols="5">0.245228 0.195044 0.306005 0.088897 0.224250</cell><cell>0.818440</cell><cell>0.811700</cell><cell>0.456199</cell><cell>0.629189</cell></row><row><cell>auebnlpgroup</cell><cell>630</cell><cell>0.621112</cell><cell>0.204883</cell><cell>0.111034</cell><cell>0.289907</cell><cell>0.068022</cell><cell>0.176923</cell><cell>0.804067</cell><cell>0.798684</cell><cell>0.486560</cell><cell>0.626134</cell></row><row><cell>2Q2T</cell><cell>643</cell><cell>0.617814</cell><cell>0.247755</cell><cell>0.221252</cell><cell>0.313942</cell><cell>0.098590</cell><cell>0.220037</cell><cell>0.827074</cell><cell>0.813756</cell><cell>0.475908</cell><cell>0.622447</cell></row><row><cell>MICLab</cell><cell>678</cell><cell>0.612850</cell><cell>0.213525</cell><cell>0.185269</cell><cell>0.306743</cell><cell>0.077181</cell><cell>0.158239</cell><cell>0.815925</cell><cell>0.804924</cell><cell>0.445257</cell><cell>0.617195</cell></row><row><cell>DLNU_CCSE</cell><cell>674</cell><cell>0.606578</cell><cell>0.217857</cell><cell>0.151179</cell><cell>0.283133</cell><cell>0.070419</cell><cell>0.168765</cell><cell>0.796707</cell><cell>0.790424</cell><cell>0.475625</cell><cell>0.612954</cell></row><row><cell>Kaprov</cell><cell>559</cell><cell>0.596362</cell><cell>0.190497</cell><cell>0.169726</cell><cell>0.295109</cell><cell>0.060896</cell><cell>0.107017</cell><cell>0.792183</cell><cell>0.787201</cell><cell>0.439971</cell><cell>0.608924</cell></row><row><cell>DS@BioMed</cell><cell>571</cell><cell>0.579438</cell><cell>0.103095</cell><cell>0.012144</cell><cell>0.220211</cell><cell>0.035335</cell><cell>0.071529</cell><cell>0.775566</cell><cell>0.774823</cell><cell>0.529529</cell><cell>0.580388</cell></row><row><cell>DBS-HHU</cell><cell>637</cell><cell>0.576891</cell><cell>0.153103</cell><cell>0.149275</cell><cell>0.270965</cell><cell>0.055929</cell><cell>0.064361</cell><cell>0.784199</cell><cell>0.774985</cell><cell>0.476634</cell><cell>0.588744</cell></row><row><cell cols="2">KDE-medical-caption 557</cell><cell>0.567329</cell><cell>0.132496</cell><cell>0.106025</cell><cell>0.256576</cell><cell>0.038628</cell><cell>0.038404</cell><cell>0.765059</cell><cell>0.760958</cell><cell>0.502234</cell><cell>0.569659</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>Most common words (excluding stop-words) Word showing right left ct image chest scan computed tomography shows Occurrences</head><label></label><figDesc></figDesc><table><row><cell>22,519</cell><cell>18,258 18,136 15,167 10,245 10,082 9,296</cell><cell>9,273</cell><cell>8,969</cell><cell>8,600</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head>Table 3</head><label>3</label><figDesc>The five most common captions found in the ImageCLEFmedical2024 train set alongside the number of images they are associated with.</figDesc><table><row><cell></cell><cell>Most common captions</cell><cell></cell></row><row><cell>Position</cell><cell>Caption</cell><cell>Occurences</cell></row><row><cell>1</cell><cell>Initial panoramic radiograph</cell><cell>40</cell></row><row><cell>2</cell><cell>Final panoramic radiograph</cell><cell>37</cell></row><row><cell>3</cell><cell>Chest X-ray</cell><cell>20</cell></row><row><cell>4</cell><cell>Chest radiograph</cell><cell>17</cell></row><row><cell>5</cell><cell></cell><cell></cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_3"><head>Table 4</head><label>4</label><figDesc>Performance comparison of different models on test set, VD stands for VisionDiagnostor.</figDesc><table><row><cell>Model</cell><cell cols="11">Model Size BERTScore ROUGE BLEU-1 BLEURT METEOR CIDEr CLIPScore RefCLIPScore ClinicalBLEURT MedBERTScore</cell></row><row><cell>VD-Q-BioMistral</cell><cell>8B</cell><cell>0.6200</cell><cell>0.2139</cell><cell>0.1685</cell><cell>0.2913</cell><cell>0.0751</cell><cell>0.1585</cell><cell>0.8132</cell><cell>0.8014</cell><cell>0.4597</cell><cell>0.6233</cell></row><row><cell>VD-ClinicalT5</cell><cell>310M</cell><cell>0.5994</cell><cell>0.2363</cell><cell>0.2323</cell><cell>0.2954</cell><cell>0.0989</cell><cell>0.1442</cell><cell>0.8244</cell><cell>0.8100</cell><cell>0.4597</cell><cell>0.6016</cell></row><row><cell>VD-BioBART</cell><cell>227M</cell><cell>0.6267</cell><cell>0.2452</cell><cell>0.1950</cell><cell>0.3060</cell><cell>0.0889</cell><cell>0.2243</cell><cell>0.8184</cell><cell>0.8117</cell><cell>0.4562</cell><cell>0.6292</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_4"><head>Table 6</head><label>6</label><figDesc>Group of caption length in valid set.</figDesc><table><row><cell>Group</cell><cell cols="2">Length (n) Samples</cell></row><row><cell>Short</cell><cell>𝑛 ≤ 20</cell><cell>5,520</cell></row><row><cell>Medium</cell><cell>20 &lt; 𝑛 ≤ 25</cell><cell>2,179</cell></row><row><cell>Long</cell><cell>25 &lt; 𝑛 ≤ 30</cell><cell>1,339</cell></row><row><cell>Very long</cell><cell>𝑛 &gt; 30</cell><cell>934</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_5"><head></head><label></label><figDesc>• Medium caption: This group includes captions from 21 to 25 words. • Long caption: Captions in this group from 26 to 30 words. • Very long caption: This group contains captions longer than 30 words.</figDesc><table /></figure>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgment</head><p>This research is funded by University of Information Technology-Vietnam National University HoChiMinh City under grant number D4-2024-01.</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Machine learning for metabolic engineering: A review</title>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">E</forename><surname>Lawson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">M</forename><surname>Martí</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Radivojevic</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">V R</forename><surname>Jonnalagadda</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Gentz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">J</forename><surname>Hillson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Peisert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><forename type="middle">A</forename><surname>Simmons</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">J</forename><surname>Petzold</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Metabolic Engineering</title>
		<imprint>
			<biblScope unit="volume">63</biblScope>
			<biblScope unit="page" from="34" to="60" />
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">Diagnostic captioning: a survey</title>
		<author>
			<persName><forename type="first">J</forename><surname>Pavlopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kougia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Androutsopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Papamichail</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Knowledge and Information Systems</title>
		<imprint>
			<biblScope unit="volume">64</biblScope>
			<biblScope unit="page" from="1691" to="1722" />
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Overview of ImageCLEF 2024: Multimedia retrieval in medical applications</title>
		<author>
			<persName><forename type="first">B</forename><surname>Ionescu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Müller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Drăgulinescu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Rückert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ben Abacha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>García Seco De Herrera</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Bloch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Brüngel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Idrissi-Yaghir</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Schäfer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">S</forename><surname>Schmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">M G</forename><surname>Pakull</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Damm</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Bracke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Friedrich</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Andrei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Prokopchuk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Karpenka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Radzhabov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kovalev</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Macaire</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Schwab</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Lecouteux</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Esperança-Rodier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Yim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Sun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Yetisgen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Xia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Hicks</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">A</forename><surname>Riegler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Thambawita</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Storås</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Halvorsen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Heinrich</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kiesel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Potthast</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Stein</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Experimental IR Meets Multilinguality, Multimodality, and Interaction, Proceedings of the 15th International Conference of the CLEF Association (CLEF 2024</title>
		<title level="s">Springer Lecture Notes in Computer Science LNCS</title>
		<meeting><address><addrLine>Grenoble, France</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Overview of ImageCLEFmedical 2024 -Caption Prediction and Concept Detection</title>
		<author>
			<persName><forename type="first">J</forename><surname>Rückert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ben Abacha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">G</forename><surname>Seco De Herrera</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Bloch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Brüngel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Idrissi-Yaghir</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Schäfer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Bracke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Damm</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">M G</forename><surname>Pakull</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">S</forename><surname>Schmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Müller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Friedrich</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">CLEF2024 Working Notes, CEUR Workshop Proceedings</title>
				<meeting><address><addrLine>Grenoble, France</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Learning to read chest x-rays: Recurrent neural cascade model for automated image annotation</title>
		<author>
			<persName><forename type="first">H.-C</forename><surname>Shin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Roberts</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Demner-Fushman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Yao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">M</forename><surname>Summers</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE conference on computer vision and pattern recognition</title>
				<meeting>the IEEE conference on computer vision and pattern recognition</meeting>
		<imprint>
			<date type="published" when="2016">2016</date>
			<biblScope unit="page" from="2497" to="2506" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<monogr>
		<title level="m" type="main">Medical image captioning based on deep architectures</title>
		<author>
			<persName><forename type="first">G</forename><surname>Moschovis</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">A survey on biomedical image captioning</title>
		<author>
			<persName><forename type="first">J</forename><surname>Pavlopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kougia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Androutsopoulos</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the second workshop on shortcomings in vision and language</title>
				<meeting>the second workshop on shortcomings in vision and language</meeting>
		<imprint>
			<date type="published" when="2019">2019</date>
			<biblScope unit="page" from="26" to="36" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">Attention is all you need</title>
		<author>
			<persName><forename type="first">A</forename><surname>Vaswani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Shazeer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Parmar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Uszkoreit</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Jones</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">N</forename><surname>Gomez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Ł</forename><surname>Kaiser</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Polosukhin</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Advances in neural information processing systems</title>
		<imprint>
			<biblScope unit="volume">30</biblScope>
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Anchor detr: Query design for transformer-based detector</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Sun</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the AAAI conference on artificial intelligence</title>
				<meeting>the AAAI conference on artificial intelligence</meeting>
		<imprint>
			<date type="published" when="2022">2022</date>
			<biblScope unit="volume">36</biblScope>
			<biblScope unit="page" from="2567" to="2575" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<monogr>
		<author>
			<persName><forename type="first">W</forename><forename type="middle">X</forename><surname>Zhao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Zhou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Tang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Hou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Min</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Dong</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2303.18223</idno>
		<title level="m">A survey of large language models</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">Vinvl: Revisiting visual representations in vision-language models</title>
		<author>
			<persName><forename type="first">P</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Choi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Gao</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</title>
				<meeting>the IEEE/CVF conference on computer vision and pattern recognition</meeting>
		<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="5579" to="5588" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">A comparative study of medical imaging techniques</title>
		<author>
			<persName><forename type="first">H</forename><surname>Kasban</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>El-Bendary</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Salama</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">International Journal of Information Science and Intelligent System</title>
		<imprint>
			<biblScope unit="volume">4</biblScope>
			<biblScope unit="page" from="37" to="58" />
			<date type="published" when="2015">2015</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">X-ray imaging physics for nuclear medicine technologists. part 2: X-ray interactions and image formation</title>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">A</forename><surname>Seibert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">M</forename><surname>Boone</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of nuclear medicine technology</title>
		<imprint>
			<biblScope unit="volume">33</biblScope>
			<biblScope unit="page" from="3" to="18" />
			<date type="published" when="2005">2005</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">Fully automatic segmentation of the brain in mri</title>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">S</forename><surname>Atkins</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><forename type="middle">T</forename><surname>Mackiewich</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE transactions on medical imaging</title>
		<imprint>
			<biblScope unit="volume">17</biblScope>
			<biblScope unit="page" from="98" to="107" />
			<date type="published" when="1998">1998</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">Radiology objects in context (roco): a multimodal image dataset</title>
		<author>
			<persName><forename type="first">O</forename><surname>Pelka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Koitka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Rückert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Nensa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Friedrich</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Intravascular Imaging and Computer Assisted Stenting and Large-Scale Annotation of Biomedical Data and Expert Label Synthesis: 7th Joint International Workshop, CVII-STENT 2018 and Third International Workshop</title>
		<title level="s">Proceedings</title>
		<meeting><address><addrLine>LABELS; Granada, Spain</addrLine></address></meeting>
		<imprint>
			<publisher>Springer</publisher>
			<date type="published" when="2018-09-16">2018. September 16, 2018. 2018</date>
			<biblScope unit="volume">3</biblScope>
			<biblScope unit="page" from="180" to="189" />
		</imprint>
	</monogr>
	<note>Held in Conjunction with MICCAI 2018</note>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">-Vaya, Padchest: A large chest x-ray image dataset with multi-label annotated reports</title>
		<author>
			<persName><forename type="first">A</forename><surname>Bustos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Pertusa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J.-M</forename><surname>Salinas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>De La Iglesia-Vayá</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Medical image analysis</title>
		<imprint>
			<biblScope unit="volume">66</biblScope>
			<biblScope unit="page">101797</biblScope>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<monogr>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">E</forename><surname>Johnson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">J</forename><surname>Pollard</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">R</forename><surname>Greenbaum</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">P</forename><surname>Lungren</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>-Y. Deng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Peng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">G</forename><surname>Mark</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">J</forename><surname>Berkowitz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Horng</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1901.07042</idno>
		<title level="m">Mimic-cxr-jpg, a large publicly available database of labeled chest radiographs</title>
				<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b17">
	<analytic>
		<title level="a" type="main">Chest x-ray caption generation with chexnet</title>
		<author>
			<persName><forename type="first">V</forename><surname>Wijerathna</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Raveen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Abeygunawardhana</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">D</forename><surname>Ambegoda</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">2022 Moratuwa Engineering Research Conference (MERCon), IEEE</title>
				<imprint>
			<date type="published" when="2022">2022</date>
			<biblScope unit="page" from="1" to="6" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">Medicat: A dataset of medical images, captions, and textual references</title>
		<author>
			<persName><forename type="first">S</forename><surname>Subramanian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">L</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Bogin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Mehta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Van Zuylen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Parasa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Singh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Gardner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Hajishirzi</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Findings of the Association for Computational Linguistics: EMNLP</title>
				<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<analytic>
		<title level="a" type="main">Long short-term memory</title>
		<author>
			<persName><forename type="first">A</forename><surname>Graves</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Supervised sequence labelling with recurrent neural networks</title>
		<imprint>
			<date type="published" when="2012">2012</date>
			<biblScope unit="page" from="37" to="45" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b20">
	<analytic>
		<title level="a" type="main">Gate-variants of gated recurrent unit (gru) neural networks</title>
		<author>
			<persName><forename type="first">R</forename><surname>Dey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><forename type="middle">M</forename><surname>Salem</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">IEEE 60th international midwest symposium on circuits and systems (MWSCAS), IEEE</title>
				<imprint>
			<date type="published" when="2017">2017. 2017</date>
			<biblScope unit="page" from="1597" to="1600" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b21">
	<analytic>
		<title level="a" type="main">Mdnet: A semantically and visually interpretable medical image diagnosis network</title>
		<author>
			<persName><forename type="first">Z</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Xie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Xing</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Mcgough</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Yang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE conference on computer vision and pattern recognition</title>
				<meeting>the IEEE conference on computer vision and pattern recognition</meeting>
		<imprint>
			<date type="published" when="2017">2017</date>
			<biblScope unit="page" from="6428" to="6436" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<analytic>
		<title level="a" type="main">Imagesem group at imageclefmed caption 2021 task: Exploring the clinical significance of the textual descriptions derived from medical images</title>
		<author>
			<persName><forename type="first">X</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Guo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Sun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Li</surname></persName>
		</author>
		<ptr target="https://api.semanticscholar.org/CorpusID:237298727" />
	</analytic>
	<monogr>
		<title level="m">Conference and Labs of the Evaluation Forum</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b23">
	<analytic>
		<title level="a" type="main">AUEB NLP group at imageclefmed caption</title>
		<author>
			<persName><forename type="first">V</forename><surname>Kougia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Pavlopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Androutsopoulos</surname></persName>
		</author>
		<ptr target="https://ceur-ws.org/Vol-2380/paper_136.pdf" />
	</analytic>
	<monogr>
		<title level="m">Working Notes of CLEF 2019 -Conference and Labs of the Evaluation Forum</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<editor>
			<persName><forename type="first">L</forename><surname>Cappellato</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">N</forename><surname>Ferro</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">D</forename><forename type="middle">E</forename><surname>Losada</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">H</forename><surname>Müller</surname></persName>
		</editor>
		<meeting><address><addrLine>Lugano, Switzerland</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2019-09-09">2019. September 9-12, 2019. 2019</date>
			<biblScope unit="volume">2380</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b24">
	<analytic>
		<title level="a" type="main">Hybrid retrieval-generation reinforced agent for medical image report generation</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Liang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><forename type="middle">P</forename><surname>Xing</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Advances in Neural Information Processing Systems</title>
				<editor>
			<persName><forename type="first">S</forename><surname>Bengio</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">H</forename><surname>Wallach</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">H</forename><surname>Larochelle</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">K</forename><surname>Grauman</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">N</forename><surname>Cesa-Bianchi</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">R</forename><surname>Garnett</surname></persName>
		</editor>
		<imprint>
			<publisher>Curran Associates, Inc</publisher>
			<date type="published" when="2018">2018</date>
			<biblScope unit="volume">31</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b25">
	<monogr>
		<author>
			<persName><forename type="first">P</forename><surname>Rajpurkar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Irvin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Zhu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Mehta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Duan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Ding</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Bagul</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Langlotz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Shpanskaya</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1711.05225</idno>
		<title level="m">Chexnet: Radiologist-level pneumonia detection on chest x-rays with deep learning</title>
				<imprint>
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b26">
	<analytic>
		<title level="a" type="main">Large language models encode clinical knowledge</title>
		<author>
			<persName><forename type="first">K</forename><surname>Singhal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Azizi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Tu</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Nature</title>
		<imprint>
			<biblScope unit="volume">620</biblScope>
			<biblScope unit="page" from="172" to="180" />
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b27">
	<monogr>
		<title level="m" type="main">Towards expert-level medical question answering with large language models</title>
		<author>
			<persName><forename type="first">K</forename><surname>Singhal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Tu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Gottweis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Sayres</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2305.09617</idno>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b28">
	<analytic>
		<title level="a" type="main">Aueb nlp group at imageclefmedical caption</title>
		<author>
			<persName><forename type="first">P</forename><surname>Kaliosis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Moschovis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Charalambakos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Pavlopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Androutsopoulos</surname></persName>
		</author>
		<ptr target="org" />
	</analytic>
	<monogr>
		<title level="m">CLEF2023 Working Notes, CEUR Workshop Proceedings, CEUR-WS</title>
				<meeting><address><addrLine>Thessaloniki, Greece</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023. 2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b29">
	<monogr>
		<author>
			<persName><forename type="first">J</forename><surname>Rückert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Bloch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Brüngel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Idrissi-Yaghir</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Schäfer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">S</forename><surname>Schmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Koitka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Pelka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">B</forename><surname>Abacha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">G S</forename><surname>De Herrera</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Müller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">A</forename><surname>Horn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Nensa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Friedrich</surname></persName>
		</author>
		<ptr target="https://arxiv.org/abs/2405.10004v1.arXiv:2405.10004" />
		<title level="m">ROCOv2: Radiology Objects in COntext version 2, an updated multimodal image dataset</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b30">
	<monogr>
		<author>
			<persName><forename type="first">E</forename><surname>Lehman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Johnson</surname></persName>
		</author>
		<title level="m">Clinical-t5: Large language models built using mimic clinical text</title>
				<imprint>
			<publisher>PhysioNet</publisher>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b31">
	<monogr>
		<author>
			<persName><forename type="first">H</forename><surname>Yuan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Yuan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Gan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Xie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Yu</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2204.03905</idno>
		<title level="m">Biobart: Pretraining and evaluation of a biomedical generative language model</title>
				<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b32">
	<analytic>
		<title level="a" type="main">Exploring the limits of transfer learning with a unified text-to-text transformer</title>
		<author>
			<persName><forename type="first">C</forename><surname>Raffel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Shazeer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Roberts</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Narang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Matena</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">J</forename><surname>Liu</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of machine learning research</title>
		<imprint>
			<biblScope unit="volume">21</biblScope>
			<biblScope unit="page" from="1" to="67" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b33">
	<monogr>
		<title level="m" type="main">Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension</title>
		<author>
			<persName><forename type="first">M</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Goyal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Ghazvininejad</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mohamed</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Levy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Stoyanov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Zettlemoyer</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1910.13461</idno>
		<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b34">
	<analytic>
		<title level="a" type="main">Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models</title>
		<author>
			<persName><forename type="first">J</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Savarese</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Hoi</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">International conference on machine learning</title>
				<imprint>
			<publisher>PMLR</publisher>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="19730" to="19742" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b35">
	<analytic>
		<title level="a" type="main">Masked autoencoders are scalable vision learners</title>
		<author>
			<persName><forename type="first">K</forename><surname>He</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Xie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Dollár</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Girshick</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</title>
				<meeting>the IEEE/CVF conference on computer vision and pattern recognition</meeting>
		<imprint>
			<date type="published" when="2022">2022</date>
			<biblScope unit="page" from="16000" to="16009" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b36">
	<monogr>
		<author>
			<persName><forename type="first">Y</forename><surname>Labrak</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Bazoge</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Morin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P.-A</forename><surname>Gourraud</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Rouvier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Dufour</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2402.10373</idno>
		<title level="m">Biomistral: A collection of open-source pretrained large language models for medical domains</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b37">
	<monogr>
		<author>
			<persName><forename type="first">T</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kishore</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">Q</forename><surname>Weinberger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Artzi</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1904.09675</idno>
		<title level="m">Bertscore: Evaluating text generation with bert</title>
				<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b38">
	<analytic>
		<title level="a" type="main">Rouge: A package for automatic evaluation of summaries</title>
		<author>
			<persName><forename type="first">C.-Y</forename><surname>Lin</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Text summarization branches out</title>
				<imprint>
			<date type="published" when="2004">2004</date>
			<biblScope unit="page" from="74" to="81" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b39">
	<analytic>
		<title level="a" type="main">Bleu: a method for automatic evaluation of machine translation</title>
		<author>
			<persName><forename type="first">K</forename><surname>Papineni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Roukos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Ward</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W.-J</forename><surname>Zhu</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 40th annual meeting of the Association for Computational Linguistics</title>
				<meeting>the 40th annual meeting of the Association for Computational Linguistics</meeting>
		<imprint>
			<date type="published" when="2002">2002</date>
			<biblScope unit="page" from="311" to="318" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b40">
	<analytic>
		<title level="a" type="main">Bleurt: Learning robust metrics for text generation</title>
		<author>
			<persName><forename type="first">T</forename><surname>Sellam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Das</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Parikh</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</title>
				<meeting>the 58th Annual Meeting of the Association for Computational Linguistics</meeting>
		<imprint>
			<date type="published" when="2020">2020</date>
			<biblScope unit="page" from="7881" to="7892" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b41">
	<analytic>
		<title level="a" type="main">Meteor: An automatic metric for mt evaluation with improved correlation with human judgments</title>
		<author>
			<persName><forename type="first">S</forename><surname>Banerjee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Lavie</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and/or summarization</title>
				<meeting>the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and/or summarization</meeting>
		<imprint>
			<date type="published" when="2005">2005</date>
			<biblScope unit="page" from="65" to="72" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b42">
	<analytic>
		<title level="a" type="main">Cider: Consensus-based image description evaluation</title>
		<author>
			<persName><forename type="first">R</forename><surname>Vedantam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Lawrence Zitnick</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Parikh</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE conference on computer vision and pattern recognition</title>
				<meeting>the IEEE conference on computer vision and pattern recognition</meeting>
		<imprint>
			<date type="published" when="2015">2015</date>
			<biblScope unit="page" from="4566" to="4575" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b43">
	<analytic>
		<title level="a" type="main">Clipscore: A reference-free evaluation metric for image captioning</title>
		<author>
			<persName><forename type="first">J</forename><surname>Hessel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Holtzman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Forbes</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">Le</forename><surname>Bras</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Choi</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</title>
				<meeting>the 2021 Conference on Empirical Methods in Natural Language Processing</meeting>
		<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="7514" to="7528" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b44">
	<monogr>
		<author>
			<persName><forename type="first">K</forename><surname>Huang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Altosaar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Ranganath</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1904.05342</idno>
		<title level="m">Clinicalbert: Modeling clinical notes and predicting hospital readmission</title>
				<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b45">
	<analytic>
		<title level="a" type="main">An investigation of evaluation methods in automatic medical note generation</title>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">B</forename><surname>Abacha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W.-W</forename><surname>Yim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Michalopoulos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lin</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Findings of the Association for Computational Linguistics: ACL 2023</title>
				<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="2575" to="2588" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b46">
	<monogr>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">P</forename><surname>Kingma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Ba</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1412.6980</idno>
		<title level="m">Adam: A method for stochastic optimization</title>
				<imprint>
			<date type="published" when="2014">2014</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b47">
	<analytic>
		<title level="a" type="main">A concise model for medical image captioning</title>
		<author>
			<persName><forename type="first">A</forename><surname>Nicolson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Dowling</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Koopman</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">CLEF2023 Working Notes</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting><address><addrLine>Thessaloniki, Greece</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
		<respStmt>
			<orgName>CEUR-WS.org</orgName>
		</respStmt>
	</monogr>
</biblStruct>

<biblStruct xml:id="b48">
	<analytic>
		<title level="a" type="main">Transferring pre-trained large language-image model for medical image captioning</title>
		<author>
			<persName><forename type="first">W</forename><surname>Zhou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Ye</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Huang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Yang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">CLEF2023 Working Notes</title>
		<title level="s">CEUR Workshop Proceedings</title>
		<meeting><address><addrLine>Thessaloniki, Greece</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
		<respStmt>
			<orgName>CEUR-WS.org</orgName>
		</respStmt>
	</monogr>
</biblStruct>

<biblStruct xml:id="b49">
	<monogr>
		<author>
			<persName><forename type="first">E</forename><surname>Bolton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Venigalla</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Yasunaga</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Hall</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Xiong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Daneshjou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Frankle</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Liang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Carbin</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2403.18421</idno>
		<title level="m">Biomedlm: A 2.7 b parameter language model trained on biomedical text</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b50">
	<analytic>
		<title level="a" type="main">Biogpt: generative pre-trained transformer for biomedical text generation and mining</title>
		<author>
			<persName><forename type="first">R</forename><surname>Luo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Sun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Xia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Qin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Poon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T.-Y</forename><surname>Liu</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Briefings in bioinformatics</title>
		<imprint>
			<biblScope unit="volume">23</biblScope>
			<biblScope unit="page">409</biblScope>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b51">
	<analytic>
		<title level="a" type="main">Neuraldynamicslab at imageclefmedical</title>
		<author>
			<persName><forename type="first">G</forename><surname>Moschovis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Fransén</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">CLEF (Working Notes)</title>
				<imprint>
			<date type="published" when="2022">2022. 2022</date>
			<biblScope unit="page" from="1487" to="1504" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b52">
	<analytic>
		<title level="a" type="main">Billion-scale similarity search with gpus</title>
		<author>
			<persName><forename type="first">J</forename><surname>Johnson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Douze</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Jégou</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Transactions on Big Data</title>
		<imprint>
			<biblScope unit="volume">7</biblScope>
			<biblScope unit="page" from="535" to="547" />
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
