<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">DS@BioMed at ImageCLEFmedical Caption 2024: Enhanced Attention Mechanisms in Medical Caption Generation through Concept Detection Integration</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<!-- NOTE(review): GROBID split one author ("Ngoc-Yen Nhi Nguyen") into two records; merged here. Confirm name segmentation against the original paper. -->
							<persName><forename type="first">Ngoc-Yen</forename><forename type="middle">Nhi</forename><surname>Nguyen</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution">University of Information Technology</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Huy</forename><surname>Le Tu</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution">University of Information Technology</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Phuong</forename><forename type="middle">Dieu</forename><surname>Nguyen</surname></persName>
						</author>
						<author>
							<persName><forename type="first">Tan</forename><forename type="middle">Nhat</forename><surname>Do</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution">University of Information Technology</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Triet</forename><forename type="middle">Minh</forename><surname>Thai</surname></persName>
							<email>triettm@oucru.org</email>
							<affiliation key="aff0">
								<orgName type="institution">University of Information Technology</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
							<affiliation key="aff2">
								<orgName type="institution">Oxford University Clinical Research Unit</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Thien</forename><forename type="middle">B</forename><surname>Nguyen-Tat</surname></persName>
							<email>thienntb@uit.edu.vn</email>
							<affiliation key="aff0">
								<orgName type="institution">University of Information Technology</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="institution">Vietnam National University</orgName>
								<address>
									<settlement>Ho Chi Minh City</settlement>
									<country key="VN">Vietnam</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">DS@BioMed at ImageCLEFmedical Caption 2024: Enhanced Attention Mechanisms in Medical Caption Generation through Concept Detection Integration</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">B612B33C7F0503D8B32DE4EB95172614</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T17:52+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Medical Caption Generation</term>
					<term>Multimodal Learning</term>
					<term>Concept Detection</term>
					<term>ImageCLEF 2024</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>Purpose: Our study presents an enhanced approach to medical image caption generation by integrating concept detection into attention mechanisms. Method: This method utilizes sophisticated models to identify critical concepts within medical images, which are then refined and incorporated into the caption generation process. Results: Our concept detection task, which employed the Swin-v2 model, achieved an F1 score of 0.58944 on the validation set and 0.61998 on the private test set, securing the third position. For the caption prediction task, our BEiT+BioBart model, enhanced with concept integration and post-processing techniques, attained a BERTScore of 0.60589 on the validation set and 0.5794 on the private test set, placing ninth. Conclusion: These results underscore the efficacy of concept-aware algorithms in generating precise and contextually appropriate medical descriptions. The findings demonstrate that our approach considerably improves the quality of medical image captions, highlighting its potential to enhance medical image interpretation and documentation, thereby contributing to improved healthcare outcomes.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>The rapid growth of deep learning techniques has profoundly influenced various sectors, notably medical imaging <ref type="bibr" target="#b0">[1]</ref>. Among these advancements, using neural networks in radiology has garnered considerable attention due to its potential to enhance diagnostic accuracy and efficiency <ref type="bibr" target="#b1">[2]</ref>. A particularly intriguing development in this field is the automatic generation of medical captions from radiology images <ref type="bibr" target="#b2">[3]</ref>. This innovation aims to assist radiologists by providing preliminary interpretations and streamlining clinical documentation. Medical caption generation transforms visual information from radiological images into coherent, clinically valuable language descriptions. This process is inherently challenging due to the complexity and diversity of medical images, the need for precise and context-aware descriptions, and the necessity to incorporate domain-specific knowledge <ref type="bibr" target="#b2">[3,</ref><ref type="bibr" target="#b3">4,</ref><ref type="bibr" target="#b4">5]</ref>.</p><p>Traditional systems often fall short of these requirements, leading to the development of advanced attention mechanisms that can more effectively capture and interpret the intricate details found in radiological images. Recent research shows that integrating concept detection into caption generation algorithms improves performance <ref type="bibr" target="#b5">[6,</ref><ref type="bibr" target="#b6">7]</ref>. Concept detection involves identifying and categorizing critical visual elements in an image, such as anatomical structures, pathological findings, and medical devices. By incorporating these detected concepts into the caption generation process, models can produce more accurate and contextually relevant descriptions. 
One of the advancements in this field is the ImageCLEF campaign, an annual multimodal machine learning competition established in 2003.</p><p>ImageCLEF <ref type="bibr" target="#b7">[8]</ref> fosters advancements in multimedia processing, including computer vision, image analysis, classification, and retrieval in multilingual and multimodal contexts. In ImageCLEF 2024 <ref type="bibr" target="#b7">[8]</ref>, participants engaged in the ImageCLEFmedical Caption task <ref type="bibr" target="#b8">[9]</ref>, which included two subtasks: concept detection, aiming to identify critical elements within medical images, and caption prediction, focused on generating descriptive texts based on identified concepts. Concept detection aims to associate biomedical images with relevant medical concepts, thereby enhancing diagnostic notes by identifying key concepts that should be included in preliminary reports. Moreover, it facilitates the efficient organization and retrieval of medical images by indexing them according to related concepts. Caption prediction, or diagnostic captioning, remains a complex research challenge intended to support the diagnostic process by providing preliminary reports, rather than replacing physicians. This approach aids experienced clinicians in managing high volumes of daily medical examinations more swiftly and efficiently, while also reducing the likelihood of clinical errors among less experienced clinicians.</p><p>Our findings underscore that integrating concept detection enhances the efficacy of attention mechanisms and yields more coherent and diagnostically valuable captions. This research advances the development of intelligent technologies aimed at supporting radiologists in clinical practice, thereby elevating the standard of patient care. Section 2 provides a comprehensive review of pertinent literature. Section 3 outlines our dataset, while Section 4 describes our proposed methodology and presents experimental results. 
In Section 5, we discuss the conclusions drawn from our findings and outline avenues for future research. Our objective is to contribute to the fields of medical imaging and natural language processing by enhancing the capabilities of medical caption generation, thus paving the way for further advancements in automated reporting and medical data interpretation.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Background and Related Works</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Former Medical Datasets</head><p>Medical imaging has been a focal point in the application of deep learning, benefiting from the availability of comprehensive datasets. Early datasets such as the NIH ChestX-ray14 <ref type="bibr" target="#b9">[10]</ref> provided a large collection of chest radiographs annotated with disease labels, facilitating advancements in image classification and disease detection tasks. The MIMIC-CXR dataset <ref type="bibr" target="#b10">[11]</ref>, developed by Johnson et al., further enriched the field by offering not only radiographic images but also paired radiology reports, enabling research in image-to-text generation. These datasets have been pivotal in training and validating deep learning models, providing the groundwork for more sophisticated tasks such as medical caption generation and concept detection.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Related Work Concept Detection</head><p>Concept detection in medical imaging involves identifying and categorizing essential visual elements such as anatomical structures, pathological findings, and medical devices. This task is crucial for generating accurate and contextually relevant medical captions. Early methods primarily relied on traditional machine learning techniques, which often struggled with the complexity and variability of medical images (e.g., SVMs (support vector machines), random forests, and k-nearest neighbors). However, recent advancements in deep learning, particularly CNNs (convolutional neural networks), have improved the accuracy of concept detection. Notable CNN architectures such as ResNet50 <ref type="bibr" target="#b11">[12]</ref> and EfficientNet <ref type="bibr" target="#b12">[13]</ref> have demonstrated substantial improvements in detecting and classifying visual elements in medical images.</p><p>Recently, Transformer-based models have been increasingly applied to concept detection due to their ability to capture long-range dependencies and contextual information. Notable examples include ViT (Vision Transformer) <ref type="bibr" target="#b13">[14]</ref>, BEiT (Bidirectional Encoder representation from Image Transformers) <ref type="bibr" target="#b14">[15]</ref>, and Swin Transformer <ref type="bibr" target="#b15">[16]</ref>. These models provide robust feature representations and have shown promise in enhancing the accuracy and interpretability of medical image analysis.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.">Related Work Caption Prediction</head><p>Caption prediction, or diagnostic captioning, involves generating descriptive text that accurately summarizes the medical content of an image. This task extends beyond simple image annotation, requiring models to produce coherent and clinically meaningful narratives. Traditional captioning methods often used template-based approaches, which lacked flexibility and adaptability to different medical contexts. With the advent of deep learning, particularly sequence-to-sequence models and attention mechanisms, more sophisticated captioning systems have been developed.</p><p>For example, Jing et al. proposed a hierarchical LSTM (Long Short-Term Memory) <ref type="bibr" target="#b16">[17]</ref> model combined with a co-attention mechanism to generate detailed radiology reports from medical images. Their model effectively captured the hierarchical structure of medical reports, producing more detailed and contextually appropriate captions <ref type="bibr" target="#b2">[3]</ref>.</p><p>The introduction of Transformer models specifically designed for the medical domain has advanced the field of medical image captioning. Transformers, particularly models like BioBERT (Bidirectional Encoder Representations from Transformers for Biomedical Text Mining) <ref type="bibr" target="#b17">[18]</ref>, have demonstrated exceptional capabilities in understanding and generating biomedical text due to their ability to handle complex medical terminology and contexts. Recent research has leveraged these models to improve medical captioning. Additionally, LLMs (large language models) such as BioGPT <ref type="bibr" target="#b18">[19]</ref> have been explored for their potential to generate coherent and diagnostically valuable medical captions, further pushing the boundaries of automated reporting in radiology.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Task and Dataset Descriptions</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Task Descriptions</head><p>ImageCLEF has included medical tasks annually since 2004. Since 2019, it has focused each medical task on a specific issue but combined them into a single task with multiple subtasks. Four tasks are proposed for 2024: Image Captioning, Image Question Answering for Colonoscopy Images, MEDIQA-MAGIC, Quality Control of Synthesized Medical Images Generated by GANs. In ImageCLEF 2024 <ref type="bibr" target="#b7">[8]</ref>, we engage in the Image Captioning task <ref type="bibr" target="#b8">[9]</ref>, simultaneously participating in two subtasks: Concept Detection Task and Caption Prediction Task, each crucial in the holistic process of generating informative captions for medical images.</p><p>• Concept Detection Task: The Concept Detection Task involves using a refined subset of the UMLS 2022 AB version for concept generation. This subset is carefully selected to enhance the accuracy of concept detection by filtering concepts based on their semantic types. Moreover, to optimize concept detection from images, a stringent exclusion criterion is applied to remove low-frequency concepts, based on insights from previous iterations. • Caption Prediction Task: In the Caption Prediction Task, a series of meticulous preprocessing steps are undertaken to ensure the integrity and coherence of the captioning process. Specifically, the removal of embedded hyperlinks within captions is performed as a fundamental preprocessing step. This careful action helps maintain data cleanliness and consistency, thereby supporting subsequent analytical processes and enabling accurate caption prediction outcomes.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Dataset Information</head><p>The data for the captioning task will consist of images selected from medical literature, including annotations and related UMLS terms manually curated as metadata. For the development dataset, Radiology Objects in COntext Version 2 (ROCOv2) <ref type="bibr" target="#b19">[20]</ref>, an updated and expanded version of the Radiology Objects in COntext (ROCO) dataset <ref type="bibr" target="#b20">[21]</ref>, is used for both subtasks. As in previous versions, this dataset originates from biomedical articles in the PMC OpenAccess collection <ref type="bibr" target="#b21">[22]</ref>, with the test set comprising a set of previously unseen images.</p><p>• Training Dataset: Includes 70,108 images.</p><p>• Validation Dataset: Includes 9,972 images.</p><p>• Test Dataset: Includes 17,237 images.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Experiments and Results</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.">The Proposed Approach</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.1.">Concept Detection Methodology</head><p>We aim to extract features from images by carefully examining and testing a variety of pretrained models that fall into three main architectural paradigms, which are shown in Table <ref type="table" target="#tab_0">1</ref>. The list that follows summarizes the particular models that are being examined:</p><p>• CNN-based architectures: Microsoft/ResNet-50 <ref type="bibr" target="#b22">[23]</ref>, an archetype of conventional convolutional neural network (CNN) models, characterized by its utilization of residual blocks to mitigate the challenges associated with gradient vanishing, thereby enhancing model performance within computationally tractable bounds. • Transformer-based architectures:</p><p>-ViT (Vision Transformer) <ref type="bibr" target="#b13">[14]</ref>: Pioneering the paradigm shift in image data processing, ViT adopts a transformative approach by encoding images into patch embeddings, followed by feature extraction using a Transformer encoder, reminiscent of text data processing methodologies. -DeiT (Data-efficient Image Transformers) <ref type="bibr" target="#b23">[24]</ref>: An evolution of ViT, DeiT emphasizes data efficiency, facilitating training with reduced data volumes while preserving commendable performance metrics. -Swin-v2 (Shifted Window Transformer v2) <ref type="bibr" target="#b24">[25]</ref>: Distinguished by its innovative utilization of self-attention mechanisms within shifted windows, Swin-v2 ameliorates computational complexity and augments performance across a spectrum of tasks, including image classification and segmentation. -BEiT (Bidirectional Encoder representation from Image Transformers) <ref type="bibr" target="#b25">[26]</ref>: At the confluence of Transformer and BERT architectures, BEiT excels in capturing robust image features through bidirectional encoding methodologies. 
-BiomedCLIP <ref type="bibr" target="#b26">[27]</ref>: A domain-specific adaptation of ViT tailored for biomedical applications, leveraging the CLIP architecture to enhance performance in medical domain tasks.</p><p>• Model Ensembles: In our ensemble framework, we leverage sophisticated fusion techniques to harness the collective predictive power of multiple models. A key method employed is weighted averaging, where predictions from each member model are aggregated based on their respective weights derived from validation performance.</p><p>- Following the feature extraction step, the retrieved features pass via a linear layer and classifier, where they are transformed and classified to provide outputs that correspond to the chosen class categories. This key step emphasizes the thorough orchestration of feature transformation and classification to produce predictions specific to the required class taxonomy. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Concept Filtering</head><p>A certain process must be followed while using the BEiT (Bidirectional Encoder Representations from Image Transformers) model in order to carry out concept filtering and modify the output threshold to detect variations in the outcomes. The following are the steps to follow: On a given dataset, do inference using the BEiT model and modify the output threshold to filter the concepts or classes.</p><p>Setting various threshold values and watching the ensuing outcomes allows for this modification. We may adjust and assess how different thresholds affect the model's performance using this procedure.  Given the primary focus on Image Captioning in this research, the architectural design must effectively extract salient features from both the image and its corresponding text, combining them to generate the final caption. Our carefully curated multimodal fusion architecture incorporates essential components like an image encoder for pertinent feature extraction, a text encoder for eliciting semantic information from text, and a decoder to synthesize insights from the textual context. Additionally, the fusion mechanism integrates image features and output classifications from concept detection, synergistically blending them with textual input to decode and generate the caption output. The proposed approach leverages the pretrained Bidirectional Encoder Representations from Transformers (BEiT) model for image feature extraction. Boasting a symmetric Transformer architecture, BEiT can comprehend image representational features by concurrently considering both surrounding image patches and global context. With its extensive training on copious data, BEiT can be fine-tuned and achieve state-of-the-art results across several computer vision and image processing benchmarks.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.2.">Captioning Methodology</head><p>To encode the input text captions, this research employs two domain-specific language models:</p><p>BioBART (Bidirectional and Auto-Regressive Transformers for Biomedical Text) and ClinicalT5 (Textto-Text Transfer Transformer fine-tuned on clinical data).</p><p>• BioBART <ref type="bibr" target="#b27">[28]</ref> is a version of the BART model <ref type="bibr" target="#b28">[29]</ref> adapted and further pre-trained on biomedical text data such as medical literature, case reports, and genomic analysis documents. Leveraging its bidirectional Transformer architecture, BioBART can effectively encode both general and biomedical domain-specific text, enabling the extraction of rich semantic representations for tasks like text summarization, medical question-answering, and report generation. • ClinicalT5 <ref type="bibr" target="#b29">[30]</ref> is the T5 model <ref type="bibr" target="#b30">[31]</ref> additionally fine-tuned on clinical text data including patient records and consultation reports. Harnessing its text-to-text transfer learning capability for multi-task modeling, ClinicalT5 can be applied to various natural language processing tasks in the healthcare domain, such as treatment classification, medical information extraction, and summarization of patient records.</p><p>For the process of encoding text concepts, we utilize the output from the BEiT model, which is specifically trained for the concept detection task. During this process, we apply a threshold of 0.5 to selectively retain predictions with a confidence score higher than 0.5, while discarding predictions with lower confidence scores. This discriminative process aids in capturing the semantic essence of the detected concepts, thereby facilitating their seamless integration into the multimodal fusion architecture for further processing and analysis.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.">Experimental Settings</head><p>Several experiments have been conducted to assess the efficacy of the proposed methodologies in addressing the ImageCLEFmedical Caption 2024 challenge. Specifically, each pre-trained vision model has been instantiated and evaluated, as detailed in Table <ref type="table" target="#tab_1">2</ref>, which offers a comprehensive overview of the pre-trained models employed in this study, encompassing their respective vision model designations, versions, and parameter counts for each fusion model. These experiments serve to elucidate both the potential and limitations inherent in each model with regard to the Image Captioning task, thereby facilitating the selection of the optimal approach for generating final predictions on the private test dataset of the competition.</p><p>• Concept Detection Task: For the concept detection subtask, the optimization criterion utilized during training is the AdamW optimizer <ref type="bibr" target="#b31">[32]</ref>. The models are trained for 5 epochs with a batch size of 30 and an initial learning rate of 5e-5. During training, the BCEWithLogitsLoss function, which combines a Sigmoid layer and BCELoss, is applied, and a threshold value ranging from 0.45 to 0.5 is predominantly used to process the model's output. To ensure meaningful comparison results, consistent hyperparameters are maintained across all experiments. • Caption Prediction Task: During the training process for the caption prediction task, the CrossEntropyLoss criterion is applied with the ignore_index parameter set to the pad token index of the tokenizer. This setup helps mitigate the influence of pad tokens on loss computation, ensuring more precise training outcomes. For optimization, the AdamW optimizer is utilized with a learning rate of 1e-4 and a weight decay rate of 0.01, chosen to balance training efficiency and model generalization <ref type="bibr" target="#b31">[32]</ref>. 
To leverage the benefits of Mixed Precision Training <ref type="bibr" target="#b32">[33]</ref>, the Gradient scaler is integrated into the training pipeline. This scaler adjusts the gradient scale, enhancing training efficiency and convergence speed of the models. Additionally, the LinearScheduleWithWarmup is employed to adjust the learning rate over time during training. This scheduling mechanism requires pre-defining the number of warmup steps and total training steps to optimize the learning rate schedule effectively. During each training iteration, a batch size of 16 is utilized. Overall, these training configurations and optimizations contribute to the performance and stability of the training process, leading to superior model performance.</p><p>The hardware utilized for computation included both NVIDIA Tesla T4 and NVIDIA Tesla P100 GPUs.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3.">Evaluation Methodology</head><p>Our evaluation consists of two tasks: Concept Detection and Caption Prediction. Each task uses specific metrics to measure performance.</p><p>• Concept Detection Task: We assess the performance of concept identification using Accuracy, Precision, Recall, and F1 score. These metrics measure overall correctness, positive prediction accuracy, relevant concept capture, and balanced precision and recall, respectively <ref type="bibr" target="#b33">[34]</ref>. • Caption Prediction Task: We evaluate the quality and coherence of generated captions using BERTScore (Bidirectional Encoder Representations from Transformers Score) <ref type="bibr" target="#b34">[35]</ref>, BLEU (Bilingual Evaluation Understudy, 1-4) <ref type="bibr" target="#b35">[36]</ref>, ROUGE (Recall-Oriented Understudy for Gisting Evaluation) <ref type="bibr" target="#b36">[37]</ref>, and METEOR (Metric for Evaluation of Translation with Explicit ORdering) <ref type="bibr" target="#b37">[38]</ref>. These metrics assess semantic similarity, fluency, relevance, coherence, informativeness, and lexical/syntactic aspects.</p><p>Using this diverse set of metrics, we ensure a comprehensive understanding of the model's performance and facilitate informed decision-making for further refinement.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.4.">Experimental Results</head><p>As detailed in Table <ref type="table" target="#tab_1">2</ref>, the comparative evaluation of various concept detection models on the development validation set yields valuable insights into their performance across diverse evaluation metrics. Among these models, Swin-v2 emerges as the frontrunner, exhibiting the highest accuracy (0.16366), recall (0.47114), and F1 score (0.58944). This underscores Swin-v2's effectiveness in not only accurately identifying pertinent instances but also striking a harmonious balance between precision and recall, rendering it well-suited for concept detection endeavors. Ensemble methodologies, which combine predictions from multiple models, demonstrate promising outcomes as well. Notably, the Ensemble-2 model showcases commendable precision (0.94501) and a noteworthy F1 score (0.58581), suggesting that leveraging diverse models can augment predictive efficacy, particularly in precision-oriented tasks. While the Ensemble-4 model marginally surpasses Ensemble-2 in precision (0.94508), it exhibits a slightly lower F1 score (0.58460), implying a subtle trade-off in recall when employing additional models. BEiT-L and BiomedCLIP also manifest robust performance metrics. BEiT-L achieves an accuracy of 0.16145 and an F1 score of 0.58418, while BiomedCLIP demonstrates balanced performance with an accuracy of 0.15975 and an F1 score of 0.58319. These findings underscore the efficacy of these models in maintaining high precision and achieving a favorable balance with recall.</p><p>Other models such as BEiT-B, DeiT-B, and ViT-B exhibit commendable performance, albeit slightly trailing the top performers. For instance, BEiT-B records an accuracy of 0.15554 and an F1 score of 0.57662, indicating respectable yet not leading-edge performance. 
Similarly, DeiT-B and ViT-B attain comparable results, with DeiT-B registering an accuracy of 0.15674 and an F1 score of 0.57641, and ViT-B yielding an accuracy of 0.15413 and an F1 score of 0.57439. Conversely, ResNet-50 demonstrates notably inferior performance across all metrics, with an accuracy of 0.11412 and an F1 score of 0.51566. This underscores its relatively limited efficacy in the concept detection task.</p><p>In summation, the Swin-v2 model emerges as the most dependable choice for concept detection owing to its superior accuracy, recall, and F1 score. Ensemble methodologies, particularly Ensemble-2, exhibit robust performance, underscoring the advantages of model amalgamation. BEiT-L and BiomedCLIP offer balanced performance, rendering them viable alternatives. Meanwhile, ResNet-50's diminished performance suggests its lesser suitability for this specific task, underscoring the strides made by newer architectural advancements. As detailed in Table <ref type="table" target="#tab_2">3</ref>, the comparative analysis of various model configurations on the validation set reveals insights into the efficacy of incorporating concepts and post-processing techniques in caption generation tasks. The models evaluated include BEiT+BioBart and BEiT+Clinical-T5, with configurations either incorporating concepts derived from the Concept Detection subtask or excluding them, and applying post-processing to mitigate repetition in the output captions. The results indicate that for the BEiT+BioBart model, the inclusion of concepts and the application of post-processing do not result in any variation in performance across all evaluated metrics, including BERTScore, BLEU (from 1 to 4), ROUGE, and METEOR. 
This suggests that for BEiT+BioBart, the post-processing step does not impact the model's ability to generate captions when concepts are included, maintaining consistent performance.</p><p>In contrast, the BEiT+Clinical-T5 model demonstrates a more nuanced response to the incorporation of concepts and post-processing. When concepts are included without post-processing, there is a slight decline in BERTScore compared to the configuration without concepts. However, BLEU, ROUGE, and METEOR scores show an improvement with the inclusion of concepts, highlighting the potential benefits of concept integration in enhancing the model's performance in these specific metrics. Notably, when post-processing is applied, the BEiT+Clinical-T5 model exhibits substantial improvements across all metrics, irrespective of the presence of concepts. This improvement underscores the critical role of post-processing in refining output quality, with the highest METEOR score observed in the configuration without concepts but with post-processing. Comparing the two models, BEiT+Clinical-T5 generally outperforms BEiT+BioBart in BLEU, ROUGE, and METEOR scores. This superior performance is particularly evident when post-processing is applied, suggesting that BEiT+Clinical-T5 is more responsive to post-processing enhancements. However, BEiT+BioBart achieves a higher BERTScore when concepts are included, indicating a potential strength in semantic similarity measures.</p><p>In conclusion, the analysis underscores the importance of model selection, the strategic inclusion of concepts, and the application of post-processing in optimizing caption generation performance. BEiT+Clinical-T5 emerges as a more robust model with gains from post-processing, while BEiT+BioBart maintains consistent performance with concept inclusion. 
These findings provide valuable insights for future research and development in automated caption generation systems, emphasizing tailored approaches for different model architectures.</p><p>As detailed in Table <ref type="table" target="#tab_3">4</ref>, the performance evaluation of different models on the validation and private test sets provides a comprehensive understanding of their effectiveness across various configurations and datasets. For concept detection, three configurations were assessed: BEiT-B with a threshold of 0.45, BEiT-B with a threshold of 0.5, and Swin-v2 with a threshold of 0.5. The results reveal that the Swin-v2 model performs the best, achieving scores of 0.58944 on the validation set and 0.61998 on the private test set, suggesting superior capability in accurately detecting concepts compared to the BEiT-B models. For caption prediction, four configurations were evaluated: BEiT+Clinical-T5 without concepts and without post-processing, BEiT+Clinical-T5 with concepts and without post-processing, BEiT+Clinical-T5 with concepts and with post-processing, and BEiT+BioBart with concepts and with post-processing. The BEiT+Clinical-T5 model without concepts and post-processing scored 0.46001 on the validation set and 0.4433 on the private test set, while adding concepts slightly improved the private test set score to 0.4453. However, the most considerable performance boost was observed when post-processing was applied to the BEiT+Clinical-T5 model with concepts, raising the scores to 0.57597 on the validation set and 0.558 on the private test set. This highlights the substantial role of post-processing in enhancing model performance.</p><p>Moreover, the BEiT+BioBart model with concepts and post-processing achieved the highest scores among all configurations, with 0.60589 on the validation set and 0.5794 on the private test set. This underscores the effectiveness of combining concepts with post-processing in the BioBart architecture, suggesting that such integration can improve caption generation quality. 
Overall, the analysis emphasizes the critical influence of model configuration, the integration of concepts, and the application of post-processing on the performance outcomes. The superior performance of the Swin-v2 model for concept detection and the BEiT+BioBart model for caption prediction indicates that different models may excel in specific sub-tasks, advocating for a nuanced approach in model selection and optimization based on the task requirements and dataset characteristics. As detailed in Table <ref type="table" target="#tab_4">5</ref>, when employing the BEiT model in conjunction with ClinicalT5 for medical image analysis, several notable errors have been observed across various dimensions. These errors include incorrect identification of regions or image types, omissions in providing specific details, and inaccuracies in context, thereby impacting the overall reliability of the model's results. The model occasionally encounters difficulties in accurately identifying regions of interest within the images.  <ref type="table" target="#tab_4">5</ref>. a) ImageCLEFmedical_Caption_2024_valid_009001 is an example of Table <ref type="table" target="#tab_4">5</ref>. b) ImageCLEFmedical_Caption_2024_valid_009698 is an example of Table <ref type="table" target="#tab_4">5</ref>.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.5.">Error Analysis</head><p>For instance, it might misinterpret an anteroposterior X-ray of the pelvis as indicating bilateral tibial fractures. Similarly, it might incorrectly classify a cross-sectional, contrast-enhanced CT scan of the larynx as a left renal tumor.</p><p>Omissions in providing specific details have become evident in the model's predictions. The model often fails to provide the complex details necessary for comprehensive clinical interpretation. For example, it may overlook critical features such as the eccentric position of a metallic head in an X-ray or the presence of stratified bile in a CT scan. Moreover, contextual inaccuracies are common, leading to misleading or entirely incorrect descriptions. The model sometimes struggles to grasp the broader context of medical images, resulting in descriptions that do not align appropriately with the actual content of the images. Similarly, when utilizing the BEiT model in combination with BioBART, analogous errors have been observed across various aspects. These include incorrect identification of regions or image types, omissions in providing specific details, and contextual inaccuracies. Comparing BEiT with ClinicalT5 and BEiT with BioBART, although both models exhibit similar error patterns, there are minor differences in their performance. BEiT combined with ClinicalT5 demonstrates slightly better performance in certain aspects, such as providing more accurate descriptions and better contextual understanding. Conversely, BEiT combined with BioBART shows a slight advantage in specific scenarios, particularly in identifying anatomical structures or image types. However, both models have room for improvement, highlighting ongoing challenges in developing robust and reliable automated methods for medical image analysis. 
In both models, conceptual errors frequently occur, indicating a mismatch between the predicted concept and the actual content of the medical images. These errors underscore the challenges in accurately interpreting and classifying medical images based on their content.</p><p>To enhance the accuracy of medical image analysis models, a range of strategies must be employed to improve data quality, model architecture, and training processes. Firstly, the use of high-quality, well-annotated datasets is crucial. Combining this with data augmentation techniques such as rotation, zooming, flipping, and color adjustment can help increase the size and diversity of the training dataset, thereby enhancing the model's generalization capabilities. In terms of model architecture, employing models pre-trained on domain-specific datasets or state-of-the-art (SOTA) models that achieve superior results is essential. Furthermore, incorporating additional feature extraction from image data, such as bounding-boxes, segmentation, or advanced features, can help the model better understand the structure and context of the images. Finally, regularly testing and re-evaluating the model using diverse datasets will help in early detection of errors and timely adjustment of the model, ensuring the reliability and accuracy of medical image analysis results.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Conclusion and Future Works</head><p>In this study, an enhanced approach to medical caption generation was introduced by integrating concept detection into attention mechanisms. The method improved performance metrics, with the Swin-v2 model achieving an F1 score of 0.58944 on the validation set and 0.61998 on the private test set, earning 3rd place in concept detection. For caption prediction, the BEiT+BioBart model, augmented with concept integration and post-processing, achieved a BERTScore of 0.60589 on the validation set and 0.5794 on the private test set, securing 9th place. These results underscore the effectiveness of concept-aware systems in generating precise and contextually relevant medical descriptions.</p><p>Future work will focus on enhancing model performance through several avenues unrelated to data expansion. First, optimizing model architectures and training protocols can further improve accuracy and efficiency. Second, incorporating more advanced attention mechanisms and fine-tuning hyperparameters may yield better contextual understanding and caption quality. Third, integrating explainability techniques will ensure that model predictions are interpretable and trustworthy for healthcare professionals. Additionally, exploring transfer learning and domain adaptation techniques could enhance model performance across various medical imaging modalities. Furthermore, leveraging large language models (LLMs) such as GPT-3 and BioGPT for their potential to generate coherent and diagnostically valuable medical captions will be explored <ref type="bibr">[39] [19]</ref>. Finally, developing robust post-processing algorithms to further refine generated captions, ensuring they meet clinical standards, is planned. 
These efforts aim to advance the capabilities of medical image analysis and automated reporting systems, contributing to more sophisticated and reliable tools for the healthcare industry.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head></head><label></label><figDesc>Ensemble-2 model (Swin-v2 + BEiT): The symbiotic fusion of Swin-v2 and BEiT engenders a collaborative synergy, capitalizing on the distinctive strengths of each constituent model to surpass individual model performances. -Ensemble-4 model (Swin-v2 + BEiT + DeiT + ViT): Comprising a composite quartet of models, this ensemble fortifies accuracy and generalization capabilities through the combination of representatives from Transformer-based models.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Figure 1</head><label>1</label><figDesc>Figure 1 depicts an overview of the proposed method for Medical Captioning task.</figDesc><graphic coords="5,121.64,403.91,352.00,141.49" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: An overview of the multimodal architecture for Medical Caption Generation challenge.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 2 :</head><label>2</label><figDesc>Figure 2: Example images of caption prediction. The images are arranged in sequential order in Table5. a) ImageCLEFmedical_Caption_2024_valid_009001 is an example of Table5. b) ImageCLEFmedical_Caption_2024_valid_009698 is an example of Table5.</figDesc><graphic coords="10,128.41,65.61,338.43,173.31" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1</head><label>1</label><figDesc>Statistics of models for the Concept Detection subtask.</figDesc><table><row><cell>Models</cell><cell cols="2">Version Detailed</cell><cell># Parameters</cell></row><row><cell>Resnet-50</cell><cell>-</cell><cell>microsoft/resnet-50</cell><cell>27 122 124</cell></row><row><cell>BEiT</cell><cell>base</cell><cell>microsoft/beit-base-patch16-224</cell><cell>88 065 356</cell></row><row><cell>BEiT</cell><cell>base</cell><cell>microsoft/beit-base-patch16-224</cell><cell>88 065 356</cell></row><row><cell>Swin</cell><cell>v2</cell><cell>microsoft/swinv2-base-patch4-window12-192-22k</cell><cell>89 459 332</cell></row><row><cell>DeiT</cell><cell>base</cell><cell>facebook/deit-base-patch16-224</cell><cell>88 692 620</cell></row><row><cell>ViT</cell><cell>base</cell><cell>google/vit-base-patch16-224</cell><cell>88 692 620</cell></row><row><cell>BiomedCLIP</cell><cell>base</cell><cell>ikim-uk-essen/BiomedCLIP_ViT_patch16_224</cell><cell>88 692 620</cell></row><row><cell>BEIT</cell><cell>large</cell><cell>microsoft/beit-large-patch16-224</cell><cell>305 971 084</cell></row><row><cell>Ensemble-2</cell><cell>-</cell><cell>Swin-v2 + BEiT</cell><cell>-</cell></row><row><cell>Ensemble-4</cell><cell>-</cell><cell>Swin-v2 + BEiT + DeiT + ViT</cell><cell>-</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>Table 2</head><label>2</label><figDesc>Comparative performance of the Concept Detection method on the validation set.</figDesc><table><row><cell>Models</cell><cell cols="3">Accuracy Precision Recall</cell><cell>F1</cell></row><row><cell>Resnet-50</cell><cell>0.11412</cell><cell>0.89235</cell><cell>0.39643</cell><cell>0.51566</cell></row><row><cell>BEiT-B</cell><cell>0.15554</cell><cell>0.93087</cell><cell>0.45961</cell><cell>0.57662</cell></row><row><cell>Swin-v2</cell><cell>0.16366</cell><cell>0.94428</cell><cell cols="2">0.47114 0.58944</cell></row><row><cell>DeiT-B</cell><cell>0.15674</cell><cell>0.93353</cell><cell>0.45849</cell><cell>0.57641</cell></row><row><cell>ViT-B</cell><cell>0.15413</cell><cell>0.93477</cell><cell>0.45571</cell><cell>0.57439</cell></row><row><cell>BiomedCLIP</cell><cell>0.15975</cell><cell>0.94095</cell><cell>0.46453</cell><cell>0.58319</cell></row><row><cell>BEIT-L</cell><cell>0.16145</cell><cell>0.93669</cell><cell>0.46700</cell><cell>0.58418</cell></row><row><cell>Ensemble-2</cell><cell>0.16155</cell><cell>0.94501</cell><cell>0.46683</cell><cell>0.58581</cell></row><row><cell>Ensemble-4</cell><cell>0.16135</cell><cell>0.94508</cell><cell>0.46526</cell><cell>0.58460</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head>Table 3</head><label>3</label><figDesc>A comparative analysis of various configurations on the validation set, with "Process" denoting post-processing of output captions to mitigate repetition, and "Concepts" representing features potentially derived from the Concept Detection subtask.</figDesc><table><row><cell>Model</cell><cell>Configuration</cell><cell cols="7">BERTScore BLEU1 BLEU2 BLEU3 BLEU4 ROUGE METEOR</cell></row><row><cell>BEiT+BioBart</cell><cell>Concepts+No-Process</cell><cell>0.60589</cell><cell>0.03293</cell><cell>0.01019</cell><cell>0.00337</cell><cell>0.00040</cell><cell>0.10721</cell><cell>0.05673</cell></row><row><cell>BEiT+BioBart</cell><cell>Concepts+Process</cell><cell>0.60589</cell><cell>0.03293</cell><cell>0.01019</cell><cell>0.00337</cell><cell>0.00040</cell><cell>0.10721</cell><cell>0.05673</cell></row><row><cell cols="2">BEiT+Clinical-T5 Concepts+No-Process</cell><cell>0.45752</cell><cell>0.07408</cell><cell>0.03008</cell><cell>0.01244</cell><cell>0.00476</cell><cell>0.09298</cell><cell>0.08909</cell></row><row><cell cols="2">BEiT+Clinical-T5 Concepts+Process</cell><cell>0.57597</cell><cell cols="5">0.08145 0.03319 0.01423 0.00519 0.13336</cell><cell>0.09817</cell></row><row><cell cols="2">BEiT+Clinical-T5 No-Concepts+No-Process</cell><cell>0.46001</cell><cell>0.07501</cell><cell>0.03057</cell><cell>0.01077</cell><cell>0.00303</cell><cell>0.09711</cell><cell>0.09298</cell></row><row><cell cols="2">BEiT+Clinical-T5 No-Concepts+Process</cell><cell>0.57487</cell><cell>0.08110</cell><cell>0.03231</cell><cell>0.01137</cell><cell>0.00310</cell><cell>0.13293</cell><cell>0.10086</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_3"><head>Table 4</head><label>4</label><figDesc>Performance evaluation of different models on the validation set and private test set. model performs the best, achieving scores of 0.58944 on the validation set and 0.61998 on the private test set, suggesting superior capability in accurately detecting concepts compared to the BEiT-B models. The Concept BEiT-B model with a threshold of 0.45 also shows strong performance, though slightly lower than Swin-v2, indicating the threshold setting's impact on model efficacy.</figDesc><table><row><cell>#</cell><cell>Models</cell><cell>Configuration</cell><cell cols="2">Validation set Test set</cell></row><row><cell>Concept</cell><cell>BEiT-B</cell><cell>Threshold_0.45</cell><cell>0.57662</cell><cell>0.61079</cell></row><row><cell cols="2">Detection BEiT-B</cell><cell>Threshold_0.5</cell><cell>-</cell><cell>0.60904</cell></row><row><cell></cell><cell>Swin-v2</cell><cell>Threshold_0.5</cell><cell>0.58944</cell><cell>0.61998</cell></row><row><cell>Caption</cell><cell cols="2">BEiT+Clinical-T5 No-Concepts+No-Process</cell><cell>0.46001</cell><cell>0.4433</cell></row><row><cell cols="3">Prediction BEiT+Clinical-T5 Concepts+No-Process</cell><cell>0.45752</cell><cell>0.4453</cell></row><row><cell></cell><cell cols="2">BEiT+Clinical-T5 Concepts+Process</cell><cell>0.57597</cell><cell>0.558</cell></row><row><cell></cell><cell>BEiT+BioBart</cell><cell>Concepts+Process</cell><cell>0.60589</cell><cell>0.5794</cell></row><row><cell>that the Swin-v2</cell><cell></cell><cell></cell><cell></cell><cell></cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_4"><head>Table 5</head><label>5</label><figDesc>Example outputs of caption prediction from different models.</figDesc><table><row><cell>Ground Truth</cell><cell>BEiT + Clinical-T5</cell><cell>BEiT + BioBART</cell><cell>Predicted Concepts</cell></row><row><cell>Axial contrasted CT image of larynx, showing left sided glottic versus supraglottic mass.</cell><cell>CT scan showing mass lesion (arrow)</cell><cell>CT scan showing left renal mass</cell><cell>Magnetic Resonance Imaging</cell></row><row><cell>Chest X-ray face (solitary pulmonary nodule of the heart-phrenic angle).</cell><cell>Chest X-ray showing opacification (arrow) chest</cell><cell>Chest X-ray showing bilateral infiltrates</cell><cell>X-Ray Computed Tomography</cell></row></table></figure>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgment</head><p>This research is funded by University of Information Technology-Vietnam National University Ho Chi Minh City under grant number D4-2024-01.</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">A survey on deep learning in medical image analysis</title>
		<author>
			<persName><forename type="first">G</forename><surname>Litjens</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Kooi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><forename type="middle">E</forename><surname>Bejnordi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">A A</forename><surname>Setio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Ciompi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Ghafoorian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">A</forename><surname>Van Der Laak</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Van Ginneken</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">I</forename><surname>Sánchez</surname></persName>
		</author>
		<idno type="DOI">10.1016/j.media.2017.07.005</idno>
		<ptr target="https://doi.org/10.1016/j.media.2017.07.005" />
	</analytic>
	<monogr>
		<title level="j">Medical Image Analysis</title>
		<imprint>
			<biblScope unit="volume">42</biblScope>
			<biblScope unit="page" from="60" to="88" />
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">Dermatologist-level classification of skin cancer with deep neural networks</title>
		<author>
			<persName><forename type="first">A</forename><surname>Esteva</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Kuprel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">A</forename><surname>Novoa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Ko</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">M</forename><surname>Swetter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><forename type="middle">M</forename><surname>Blau</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Thrun</surname></persName>
		</author>
		<idno type="DOI">10.1038/nature21056</idno>
	</analytic>
	<monogr>
		<title level="j">Nature</title>
		<imprint>
			<biblScope unit="volume">542</biblScope>
			<biblScope unit="page" from="115" to="118" />
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">On the automatic generation of medical imaging reports</title>
		<author>
			<persName><forename type="first">B</forename><surname>Jing</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Xie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Xing</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/P18-1240</idno>
		<ptr target="https://aclanthology.org/P18-1240" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">I</forename><surname>Gurevych</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">Y</forename><surname>Miyao</surname></persName>
		</editor>
		<meeting>the 56th Annual Meeting of the Association for Computational Linguistics<address><addrLine>Melbourne, Australia</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2018">2018</date>
			<biblScope unit="volume">1</biblScope>
			<biblScope unit="page" from="2577" to="2586" />
		</imprint>
	</monogr>
	<note>: Long Papers), Association for Computational Linguistics</note>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Knowledge-driven encode, retrieve, paraphrase for medical image report generation</title>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">Y</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Liang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><forename type="middle">P</forename><surname>Xing</surname></persName>
		</author>
		<idno type="DOI">10.1609/aaai.v33i01.33016666</idno>
		<ptr target="https://doi.org/10.1609/aaai.v33i01.33016666" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Thirty-Third AAAI Conference on Artificial Intelligence and Thirty-First Innovative Applications of Artificial Intelligence Conference and Ninth AAAI Symposium on Educational Advances in Artificial Intelligence, AAAI&apos;19/IAAI&apos;19/EAAI&apos;19</title>
				<meeting>the Thirty-Third AAAI Conference on Artificial Intelligence and Thirty-First Innovative Applications of Artificial Intelligence Conference and Ninth AAAI Symposium on Educational Advances in Artificial Intelligence, AAAI&apos;19/IAAI&apos;19/EAAI&apos;19</meeting>
		<imprint>
			<publisher>AAAI Press</publisher>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Learning to read chest x-rays: Recurrent neural cascade model for automated image annotation</title>
		<author>
			<persName><forename type="first">H.-C</forename><surname>Shin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Roberts</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Demner-Fushman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Yao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">M</forename><surname>Summers</surname></persName>
		</author>
		<idno type="DOI">10.1109/CVPR.2016.274</idno>
	</analytic>
	<monogr>
		<title level="m">IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</title>
				<imprint>
			<date type="published" when="2016">2016. 2016</date>
			<biblScope unit="page" from="2497" to="2506" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">Concept-aware video captioning: Describing videos with effective prior information</title>
		<author>
			<persName><forename type="first">B</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Cao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zou</surname></persName>
		</author>
		<idno type="DOI">10.1109/TIP.2023.3307969</idno>
	</analytic>
	<monogr>
		<title level="j">IEEE Transactions on Image Processing</title>
		<imprint>
			<biblScope unit="volume">32</biblScope>
			<biblScope unit="page" from="5366" to="5378" />
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">Improving image captioning via predicting structured concepts</title>
		<author>
			<persName><forename type="first">T</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Tian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Song</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Mao</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2023.emnlp-main.25</idno>
		<ptr target="https://aclanthology.org/2023.emnlp-main.25" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">H</forename><surname>Bouamor</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><surname>Pino</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">K</forename><surname>Bali</surname></persName>
		</editor>
		<meeting>the 2023 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics<address><addrLine>Singapore</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="360" to="370" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">Overview of ImageCLEF 2024: Multimedia retrieval in medical applications</title>
		<author>
			<persName><forename type="first">B</forename><surname>Ionescu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Müller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Drăgulinescu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Rückert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ben Abacha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>García Seco De Herrera</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Bloch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Brüngel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Idrissi-Yaghir</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Schäfer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">S</forename><surname>Schmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">M</forename><surname>Pakull</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Damm</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Bracke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Friedrich</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Andrei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Prokopchuk</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Karpenka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Radzhabov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kovalev</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Macaire</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Schwab</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Lecouteux</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Esperança-Rodier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Yim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Sun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Yetisgen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Xia</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Hicks</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">A</forename><surname>Riegler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Thambawita</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Storås</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Halvorsen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Heinrich</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kiesel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Potthast</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Stein</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Experimental IR Meets Multilinguality, Multimodality, and Interaction, Proceedings of the 15th International Conference of the CLEF Association (CLEF 2024)</title>
		<title level="s">Springer Lecture Notes in Computer Science LNCS</title>
		<meeting><address><addrLine>Grenoble, France</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Overview of ImageCLEFmedical 2024 - Caption Prediction and Concept Detection</title>
		<author>
			<persName><forename type="first">J</forename><surname>Rückert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ben Abacha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">G</forename><surname>Seco De Herrera</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Bloch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Brüngel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Idrissi-Yaghir</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Schäfer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Bracke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Damm</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">M G</forename><surname>Pakull</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">S</forename><surname>Schmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Müller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Friedrich</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">CLEF2024 Working Notes, CEUR Workshop Proceedings</title>
				<meeting><address><addrLine>Grenoble, France</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">Chestx-ray8: Hospital-scale chest x-ray database and benchmarks on weakly-supervised classification and localization of common thorax diseases</title>
		<author>
			<persName><forename type="first">X</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Peng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Bagheri</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">M</forename><surname>Summers</surname></persName>
		</author>
		<idno type="DOI">10.1109/CVPR.2017.369</idno>
	</analytic>
	<monogr>
		<title level="m">IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</title>
				<imprint>
			<date type="published" when="2017">2017</date>
			<biblScope unit="page" from="3462" to="3471" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">Physiobank, physiotoolkit, and physionet: components of a new research resource for complex physiologic signals</title>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">L</forename><surname>Goldberger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">A</forename><surname>Amaral</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Glass</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">M</forename><surname>Hausdorff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">C</forename><surname>Ivanov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">G</forename><surname>Mark</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">E</forename><surname>Mietus</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><forename type="middle">B</forename><surname>Moody</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C.-K</forename><surname>Peng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><forename type="middle">E</forename><surname>Stanley</surname></persName>
		</author>
		<idno type="DOI">10.1161/01.CIR.101.23.e215</idno>
	</analytic>
	<monogr>
		<title level="j">Circulation</title>
		<imprint>
			<biblScope unit="volume">101</biblScope>
			<biblScope unit="page" from="E215" to="E220" />
			<date type="published" when="2000">2000</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Deep residual learning for image recognition</title>
		<author>
			<persName><forename type="first">K</forename><surname>He</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Ren</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Sun</surname></persName>
		</author>
		<idno type="DOI">10.1109/CVPR.2016.90</idno>
	</analytic>
	<monogr>
		<title level="m">IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</title>
				<imprint>
			<date type="published" when="2016">2016</date>
			<biblScope unit="page" from="770" to="778" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<monogr>
		<title level="m" type="main">EfficientNet: Rethinking model scaling for convolutional neural networks</title>
		<author>
			<persName><forename type="first">M</forename><surname>Tan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><forename type="middle">V</forename><surname>Le</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2019">2019</date>
			<publisher>ICML</publisher>
			<biblScope unit="page" from="6105" to="6114" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">An image is worth 16x16 words: Transformers for image recognition at scale</title>
		<author>
			<persName><forename type="first">A</forename><surname>Dosovitskiy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Beyer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Kolesnikov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Weissenborn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Zhai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Unterthiner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Dehghani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Minderer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Heigold</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Gelly</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Uszkoreit</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Houlsby</surname></persName>
		</author>
		<idno>abs/2010.11929</idno>
	</analytic>
	<monogr>
		<title level="m">International Conference on Learning Representations</title>
				<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">BEit: BERT pre-training of image transformers</title>
		<author>
			<persName><forename type="first">H</forename><surname>Bao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Dong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Wei</surname></persName>
		</author>
		<idno>abs/2106.08254</idno>
	</analytic>
	<monogr>
		<title level="m">International Conference on Learning Representations</title>
				<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page">8254</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">Swin transformer: Hierarchical vision transformer using shifted windows</title>
		<author>
			<persName><forename type="first">Z</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Cao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Wei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Guo</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE/CVF international conference on computer vision</title>
				<meeting>the IEEE/CVF international conference on computer vision</meeting>
		<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="10012" to="10022" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">Long short-term memory</title>
		<author>
			<persName><forename type="first">S</forename><surname>Hochreiter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Schmidhuber</surname></persName>
		</author>
		<idno type="DOI">10.1162/neco.1997.9.8.1735</idno>
	</analytic>
	<monogr>
		<title level="j">Neural Comput</title>
		<imprint>
			<biblScope unit="volume">9</biblScope>
			<biblScope unit="page" from="1735" to="1780" />
			<date type="published" when="1997">1997</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<analytic>
		<title level="a" type="main">BioBERT: a pre-trained biomedical language representation model for biomedical text mining</title>
		<author>
			<persName><forename type="first">J</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Yoon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">H</forename><surname>So</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Bioinformatics</title>
		<imprint>
			<biblScope unit="volume">36</biblScope>
			<biblScope unit="page" from="1234" to="1240" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">BioGPT: generative pre-trained transformer for biomedical text generation and mining</title>
		<author>
			<persName><forename type="first">R</forename><surname>Luo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Sun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Cheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Bi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Zhao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Wang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Briefings in Bioinformatics</title>
		<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<monogr>
		<author>
			<persName><forename type="first">J</forename><surname>Rückert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Bloch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Brüngel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Idrissi-Yaghir</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Schäfer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">S</forename><surname>Schmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Koitka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Pelka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">B</forename><surname>Abacha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">G S</forename><surname>De Herrera</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Müller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">A</forename><surname>Horn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Nensa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Friedrich</surname></persName>
		</author>
		<idno type="DOI">10.1038/s41597-024-03496-6</idno>
		<ptr target="https://arxiv.org/abs/2405.10004v1" />
		<title level="m">ROCOv2: Radiology Objects in COntext version 2, an updated multimodal image dataset</title>
				<imprint>
			<publisher>Scientific Data</publisher>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b20">
	<analytic>
		<title level="a" type="main">Radiology objects in context (roco): a multimodal image dataset</title>
		<author>
			<persName><forename type="first">O</forename><surname>Pelka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Koitka</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Rückert</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Nensa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">M</forename><surname>Friedrich</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Intravascular Imaging and Computer Assisted Stenting and Large-Scale Annotation of Biomedical Data and Expert Label Synthesis: 7th Joint International Workshop, CVII-STENT 2018 and Third International Workshop</title>
		<title level="s">Proceedings</title>
		<meeting><address><addrLine>LABELS; Granada, Spain</addrLine></address></meeting>
		<imprint>
			<publisher>Springer</publisher>
			<date type="published" when="2018-09-16">September 16, 2018</date>
			<biblScope unit="volume">3</biblScope>
			<biblScope unit="page" from="180" to="189" />
		</imprint>
	</monogr>
	<note>Held in Conjunction with MICCAI 2018</note>
</biblStruct>

<biblStruct xml:id="b21">
	<monogr>
		<ptr target="https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist/" />
		<title level="m">PMC open access subset</title>
				<imprint>
			<date type="published" when="2003">2003. 2024 May 30</date>
		</imprint>
		<respStmt>
			<orgName>National Library of Medicine</orgName>
		</respStmt>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<analytic>
		<title level="a" type="main">Deep residual learning for image recognition</title>
		<author>
			<persName><forename type="first">K</forename><surname>He</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Ren</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Sun</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE conference on computer vision and pattern recognition</title>
				<meeting>the IEEE conference on computer vision and pattern recognition</meeting>
		<imprint>
			<date type="published" when="2016">2016</date>
			<biblScope unit="page" from="770" to="778" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b23">
	<analytic>
		<title level="a" type="main">Training data-efficient image transformers &amp; distillation through attention</title>
		<author>
			<persName><forename type="first">H</forename><surname>Touvron</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Cord</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Douze</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Massa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Sablayrolles</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Jegou</surname></persName>
		</author>
		<ptr target="https://proceedings.mlr.press/v139/touvron21a.html" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 38th International Conference on Machine Learning</title>
				<editor>
			<persName><forename type="first">M</forename><surname>Meila</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">T</forename><surname>Zhang</surname></persName>
		</editor>
		<meeting>the 38th International Conference on Machine Learning<address><addrLine>PMLR</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="volume">139</biblScope>
			<biblScope unit="page" from="10347" to="10357" />
		</imprint>
	</monogr>
	<note>Proceedings of Machine Learning Research</note>
</biblStruct>

<biblStruct xml:id="b24">
	<analytic>
		<title level="a" type="main">Swin transformer v2: Scaling up capacity and resolution</title>
		<author>
			<persName><forename type="first">Z</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Hu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Yao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Xie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Wei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Ning</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Cao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Dong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Wei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Guo</surname></persName>
		</author>
		<idno type="DOI">10.1109/cvpr52688.2022.01170</idno>
	</analytic>
	<monogr>
		<title level="m">Computer Vision and Pattern Recognition</title>
				<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="11999" to="12009" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b25">
	<analytic>
		<title level="a" type="main">BEit: BERT pre-training of image transformers</title>
		<author>
			<persName><forename type="first">H</forename><surname>Bao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Dong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Wei</surname></persName>
		</author>
		<idno>abs/2106.08254</idno>
	</analytic>
	<monogr>
		<title level="m">International Conference on Learning Representations</title>
				<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page">8254</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b26">
	<monogr>
		<author>
			<persName><forename type="first">S</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Usuyama</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Bagga</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Tinn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Preston</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Rao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Wei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Valluri</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Wong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Tupini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Mazzola</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Shukla</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Liden</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Gao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">P</forename><surname>Lungren</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Naumann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Poon</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2303.00915</idno>
		<title level="m">BiomedCLIP: a multimodal biomedical foundation model pretrained from fifteen million scientific image-text pairs</title>
				<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b27">
	<analytic>
		<title level="a" type="main">BioBART: Pretraining and evaluation of a biomedical generative language model</title>
		<author>
			<persName><forename type="first">H</forename><surname>Yuan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Yuan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Gan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Xie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Yu</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2022.bionlp-1.9</idno>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 21st Workshop on Biomedical Language Processing</title>
				<meeting>the 21st Workshop on Biomedical Language Processing</meeting>
		<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b28">
	<analytic>
		<title level="a" type="main">BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension</title>
		<author>
			<persName><forename type="first">M</forename><surname>Lewis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Goyal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Ghazvininejad</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mohamed</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Levy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Stoyanov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Zettlemoyer</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2020.acl-main.703</idno>
		<ptr target="https://aclanthology.org/2020.acl-main.703" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">D</forename><surname>Jurafsky</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><surname>Chai</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">N</forename><surname>Schluter</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><surname>Tetreault</surname></persName>
		</editor>
		<meeting>the 58th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics</meeting>
		<imprint>
			<date type="published" when="2020">2020</date>
			<biblScope unit="page" from="7871" to="7880" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b29">
	<analytic>
		<title level="a" type="main">ClinicalT5: A generative language model for clinical text</title>
		<author>
			<persName><forename type="first">Q</forename><surname>Lu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Dou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Nguyen</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2022.findings-emnlp.398</idno>
		<ptr target="https://aclanthology.org/2022.findings-emnlp.398" />
	</analytic>
	<monogr>
		<title level="m">Findings of the Association for Computational Linguistics: EMNLP 2022, Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">Y</forename><surname>Goldberg</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">Z</forename><surname>Kozareva</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</editor>
		<meeting><address><addrLine>Abu Dhabi, United Arab Emirates</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2022">2022</date>
			<biblScope unit="page" from="5436" to="5443" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b30">
	<analytic>
		<title level="a" type="main">Exploring the limits of transfer learning with a unified text-to-text transformer</title>
		<author>
			<persName><forename type="first">C</forename><surname>Raffel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Shazeer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Roberts</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Narang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Matena</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">J</forename><surname>Liu</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of Machine Learning Research</title>
		<imprint>
			<biblScope unit="volume">21</biblScope>
			<biblScope unit="page" from="1" to="67" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b31">
	<analytic>
		<title level="a" type="main">Adam: A method for stochastic optimization</title>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">P</forename><surname>Kingma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Ba</surname></persName>
		</author>
		<idno type="arXiv">abs/1412.6980</idno>
	</analytic>
	<monogr>
		<title level="m">International Conference on Learning Representations</title>
				<imprint>
			<date type="published" when="2014">2014</date>
			<biblScope unit="page">6980</biblScope>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b32">
	<analytic>
		<title level="a" type="main">Mixed precision training</title>
		<author>
			<persName><forename type="first">P</forename><surname>Micikevicius</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Narang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Alben</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Diamos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Elsen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Garcia</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">International Conference on Learning Representations</title>
				<imprint>
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b33">
	<analytic>
		<title level="a" type="main">A systematic analysis of performance measures for classification tasks</title>
		<author>
			<persName><forename type="first">M</forename><surname>Sokolova</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Lapalme</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Information Processing &amp; Management</title>
		<imprint>
			<biblScope unit="volume">45</biblScope>
			<biblScope unit="page" from="427" to="437" />
			<date type="published" when="2009">2009</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b34">
	<analytic>
		<title level="a" type="main">BERTScore: Evaluating text generation with BERT</title>
		<author>
			<persName><forename type="first">T</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Kishore</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">Q</forename><surname>Weinberger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Artzi</surname></persName>
		</author>
		<idno>abs/1904.09675</idno>
	</analytic>
	<monogr>
		<title level="m">International Conference on Learning Representations</title>
				<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b35">
	<analytic>
		<title level="a" type="main">Bleu: a method for automatic evaluation of machine translation</title>
		<author>
			<persName><forename type="first">K</forename><surname>Papineni</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Roukos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Ward</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W.-J</forename><surname>Zhu</surname></persName>
		</author>
		<idno type="DOI">10.3115/1073083.1073135</idno>
		<ptr target="https://aclanthology.org/P02-1040" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">P</forename><surname>Isabelle</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">E</forename><surname>Charniak</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">D</forename><surname>Lin</surname></persName>
		</editor>
		<meeting>the 40th Annual Meeting of the Association for Computational Linguistics, Association for Computational Linguistics<address><addrLine>Philadelphia, Pennsylvania, USA</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2002">2002</date>
			<biblScope unit="page" from="311" to="318" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b36">
	<analytic>
		<title level="a" type="main">ROUGE: A package for automatic evaluation of summaries</title>
		<author>
			<persName><forename type="first">C.-Y</forename><surname>Lin</surname></persName>
		</author>
		<ptr target="https://aclanthology.org/W04-1013" />
	</analytic>
	<monogr>
		<title level="m">Text Summarization Branches Out, Association for Computational Linguistics</title>
				<meeting><address><addrLine>Barcelona, Spain</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2004">2004</date>
			<biblScope unit="page" from="74" to="81" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b37">
	<analytic>
		<title level="a" type="main">METEOR: An automatic metric for MT evaluation with improved correlation with human judgments</title>
		<author>
			<persName><forename type="first">S</forename><surname>Banerjee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Lavie</surname></persName>
		</author>
		<ptr target="https://aclanthology.org/W05-0909" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization, Association for Computational Linguistics</title>
				<editor>
			<persName><forename type="first">J</forename><surname>Goldstein</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">A</forename><surname>Lavie</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">C.-Y</forename><surname>Lin</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">C</forename><surname>Voss</surname></persName>
		</editor>
		<meeting>the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization, Association for Computational Linguistics<address><addrLine>Ann Arbor, Michigan</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2005">2005</date>
			<biblScope unit="page" from="65" to="72" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b38">
	<analytic>
		<title level="a" type="main">Language models are few-shot learners</title>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">B</forename><surname>Brown</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Mann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Ryder</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Subbiah</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kaplan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Dhariwal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Neelakantan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Shyam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Sastry</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Askell</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Agarwal</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Herbert-Voss</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Krueger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Henighan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Child</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Ramesh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">M</forename><surname>Ziegler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Winter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Hesse</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Sigler</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Litwin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Gray</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Chess</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Clark</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Berner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>McCandlish</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Radford</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Sutskever</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Amodei</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 34th International Conference on Neural Information Processing Systems, NIPS &apos;20</title>
				<meeting>the 34th International Conference on Neural Information Processing Systems, NIPS &apos;20<address><addrLine>Red Hook, NY, USA</addrLine></address></meeting>
		<imprint>
			<publisher>Curran Associates Inc</publisher>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
