<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">An analogy based framework for patient-stay identification in healthcare</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Safa</forename><surname>Alsaidi</surname></persName>
							<email>safa.alsaidi@inria.fr</email>
							<affiliation key="aff0">
								<orgName type="institution">Inria Paris</orgName>
								<address>
									<postCode>F-75012</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="laboratory">Centre de Recherche des Cordeliers</orgName>
								<orgName type="institution" key="instit1">Inserm</orgName>
								<orgName type="institution" key="instit2">Université Paris Cité</orgName>
								<orgName type="institution" key="instit3">Sorbonne Université</orgName>
								<address>
									<postCode>F-75006</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Miguel</forename><surname>Couceiro</surname></persName>
							<email>miguel.couceiro@loria.fr</email>
							<affiliation key="aff2">
								<orgName type="laboratory">LORIA</orgName>
								<orgName type="institution" key="instit1">CNRS</orgName>
								<orgName type="institution" key="instit2">Université de Lorraine</orgName>
								<address>
									<postCode>F-54000</postCode>
									<country key="FR">France</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Esteban</forename><surname>Marquer</surname></persName>
							<email>esteban.marquer@loria.fr</email>
							<affiliation key="aff2">
								<orgName type="laboratory">LORIA</orgName>
								<orgName type="institution" key="instit1">CNRS</orgName>
								<orgName type="institution" key="instit2">Université de Lorraine</orgName>
								<address>
									<postCode>F-54000</postCode>
									<country key="FR">France</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Sophie</forename><surname>Quennelle</surname></persName>
							<email>sophie.quennelle@inria.fr</email>
							<affiliation key="aff0">
								<orgName type="institution">Inria Paris</orgName>
								<address>
									<postCode>F-75012</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="laboratory">Centre de Recherche des Cordeliers</orgName>
								<orgName type="institution" key="instit1">Inserm</orgName>
								<orgName type="institution" key="instit2">Université Paris Cité</orgName>
								<orgName type="institution" key="instit3">Sorbonne Université</orgName>
								<address>
									<postCode>F-75006</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
							<affiliation key="aff4">
								<orgName type="department">Service d&apos;Informatique Biomédicale</orgName>
								<orgName type="institution" key="instit1">Hôpital Necker-Enfants Malades</orgName>
								<orgName type="institution" key="instit2">Assistance Publique – Hôpitaux de Paris</orgName>
								<address>
									<postCode>F-75015</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Anita</forename><surname>Burgun</surname></persName>
							<email>anita.burgun@aphp.fr</email>
							<affiliation key="aff0">
								<orgName type="institution">Inria Paris</orgName>
								<address>
									<postCode>F-75012</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="laboratory">Centre de Recherche des Cordeliers</orgName>
								<orgName type="institution" key="instit1">Inserm</orgName>
								<orgName type="institution" key="instit2">Université Paris Cité</orgName>
								<orgName type="institution" key="instit3">Sorbonne Université</orgName>
								<address>
									<postCode>F-75006</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
							<affiliation key="aff3">
								<orgName type="institution">Imagine Institute</orgName>
								<address>
									<postCode>F-75015</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
							<affiliation key="aff4">
								<orgName type="department">Service d&apos;Informatique Biomédicale</orgName>
								<orgName type="institution" key="instit1">Hôpital Necker-Enfants Malades</orgName>
								<orgName type="institution" key="instit2">Assistance Publique – Hôpitaux de Paris</orgName>
								<address>
									<postCode>F-75015</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Nicolas</forename><surname>Garcelon</surname></persName>
							<email>nicolas.garcelon@institutimagine.org</email>
							<affiliation key="aff0">
								<orgName type="institution">Inria Paris</orgName>
								<address>
									<postCode>F-75012</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="laboratory">Centre de Recherche des Cordeliers</orgName>
								<orgName type="institution" key="instit1">Inserm</orgName>
								<orgName type="institution" key="instit2">Université Paris Cité</orgName>
								<orgName type="institution" key="instit3">Sorbonne Université</orgName>
								<address>
									<postCode>F-75006</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
							<affiliation key="aff3">
								<orgName type="institution">Imagine Institute</orgName>
								<address>
									<postCode>F-75015</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
							<affiliation key="aff4">
								<orgName type="department">Service d&apos;Informatique Biomédicale</orgName>
								<orgName type="institution" key="instit1">Hôpital Necker-Enfants Malades</orgName>
								<orgName type="institution" key="instit2">Assistance Publique – Hôpitaux de Paris</orgName>
								<address>
									<postCode>F-75015</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Adrien</forename><surname>Coulet</surname></persName>
							<email>adrien.coulet@inria.fr</email>
							<affiliation key="aff0">
								<orgName type="institution">Inria Paris</orgName>
								<address>
									<postCode>F-75012</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="laboratory">Centre de Recherche des Cordeliers</orgName>
								<orgName type="institution" key="instit1">Inserm</orgName>
								<orgName type="institution" key="instit2">Université Paris Cité</orgName>
								<orgName type="institution" key="instit3">Sorbonne Université</orgName>
								<address>
									<postCode>F-75006</postCode>
									<settlement>Paris</settlement>
									<country key="FR">France</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">An analogy based framework for patient-stay identification in healthcare</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">6E5814AEAD4D6B1FD5DF4EDC05E6FA2C</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2023-06-19T14:46+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>analogy classification, patient matching, electronic health records, patient representation learning</term>
					<term>0000-0002-4132-1068 (S. Alsaidi)</term>
					<term>0000-0003-2316-7623 (M. Couceiro)</term>
					<term>0000-0003-2315-7732 (E. Marquer)</term>
					<term>0000-0002-4782-6737 (S. Quennelle)</term>
					<term>0000-0001-6855-4366 (A. Burgun)</term>
					<term>0000-0002-3326-2811 (N. Garcelon)</term>
					<term>0000-0002-1466-062X (A. Coulet)</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>Analogical proportions are statements of the form "𝐴 is to 𝐵 as 𝐶 is to 𝐷". Analogies have been used in various reasoning and classification tasks, addressing different domains. Representation learning has enabled interesting progress in various analogy reasoning applications, where it focuses on the challenge of obtaining a vector representation of complex data. In the biomedical domain, representation learning has been adapted to patient data to solve various tasks such as predicting readmission, diagnosis, and length of stay. In this paper, we focus on the particular task of patient-stay identification, i.e., does a hospital stay belong to a patient or not? This constitutes a building block for addressing key biomedical tasks such as patient matching and privacy preservation. We propose a prototypical architecture that combines patient-stay representation learning and the analogical reasoning framework. For evaluation, we constitute sets of analogies from real-world Electronic Health Records, where objects are patient-stay representations learned from the data. We enrich our analogies using analogical properties and use them to train a neural model to detect whether an analogy is valid. We define three first experimental setups to address our task, present our empirical results, and discuss further perspectives.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>An analogical proportion, or simply analogy, is a quaternary relation involving four objects 𝐴, 𝐵, 𝐶, and 𝐷 that draws a parallel between the relation between 𝐴 and 𝐵 and the relation between 𝐶 and 𝐷, and that supports analogical reasoning. There are two common tasks associated with analogies, namely, analogy detection and analogy solving. Analogy detection aims at deciding whether a quadruple ⟨𝐴, 𝐵, 𝐶, 𝐷⟩ constitutes a valid analogy. Analogy solving aims at finding an 𝑥 that makes 𝐴 : 𝐵 :: 𝐶 : 𝑥 a valid analogy. Analogy reasoning has been applied to different Natural Language Processing (NLP) tasks such as mining paradigm tables in linguistics and image generation <ref type="bibr" target="#b0">[1,</ref><ref type="bibr" target="#b1">2]</ref>.</p><p>Representation learning consists of learning low-dimension feature representations (i.e., embeddings) from data. These embeddings, or vector representations, of objects (i.e., words, images, characters, etc.) underpin much of modern machine learning and have demonstrated impressive performance on various downstream NLP tasks. For instance, Lim et al. <ref type="bibr" target="#b2">[3]</ref> proposed a deep learning model to tackle analogies using semantic embeddings. Their architecture integrates the characteristics of analogies by design and relies heavily on pretrained GloVe embeddings <ref type="bibr" target="#b3">[4]</ref>. These embeddings were not trained explicitly to find analogies; yet they were able to detect differences between objects. Hertzmann et al. <ref type="bibr" target="#b4">[5]</ref> proposed an analogical framework to learn "image filters" between a pair of images to create an "analogous" filtered result on a third image. The generated image 𝐷 should relate to 𝐶 in the same way as 𝐵 relates to 𝐴. Alsaidi et al. 
<ref type="bibr" target="#b5">[6]</ref> developed a neural approach and used character-based embeddings to detect morphological analogies between words.</p><p>Analogies have not been sufficiently exploited in healthcare, which thus motivates our work. However, practitioners unconsciously use analogical reasoning (i.e., medical reasoning) in their daily clinical practice to understand the possible causes for a disease diagnosis and prognosis by linking visible signs and symptoms that have been observed among different patients. In addition, several machine learning methods were applied to investigate analogies in healthcare. For instance, Casteleiro et al. <ref type="bibr" target="#b6">[7]</ref> utilized analogies to infer disease treatments from statements extracted from text. In their work, they try to extract biomedical facts by analogical reasoning from embeddings. Dynomant et al. <ref type="bibr" target="#b7">[8]</ref> used analogical proportions to compare embedding methods trained on a corpus of French health-related documents (i.e., discharge summaries, procedure reports, and prescriptions). Analogical proportions were applied on the embeddings of medical documents to verify if $(\vec{A} - \vec{B}) + \vec{C} \approx \vec{D}$, thus allowing one to check whether the similarity between 𝐴 and 𝐵 is similar to the one between 𝐶 and 𝐷. An example of an analogical proportion they obtain is "(cardiology − heart) + lung ≈ pneumology." Rather et al. <ref type="bibr" target="#b8">[9]</ref> used analogical proportions to identify hidden or unknown biomedical knowledge from free text resources. They proposed analogical proportions of the form "acetaminophen is a type of drug as diabetes is a type of disease."</p><p>In this paper, we aim to explore how the analogy framework can help in solving tasks relevant to the healthcare domain. 
We propose two models that learn patient-stay representations (i.e., learn a vector representation of all the patient EHR data collected during a single stay) to detect analogies in healthcare. To do so, we define two crucial steps that are (1) the learning of embeddings adapted to patient data, and (2) the definition of a neural network dedicated to learning formal properties of analogy. As for the network, we use the same model that was proposed by Lim et al. <ref type="bibr" target="#b2">[3]</ref> for word semantics, and later adapted by Alsaidi et al. <ref type="bibr" target="#b5">[6]</ref> by incorporating character-based embeddings for morphological analogies. We argue that the framework itself has the potential to be applied in a wide range of domains, and we propose to use it here for healthcare applications, namely, for the patient identification task we introduce below.</p><p>Electronic Health Records (EHRs) are real world healthcare data that have been used to train predictive models (including neural network models) for different biomedical tasks, e.g., predicting patient mortality, hospital readmission, length of stay, etc. These EHRs consist of clinical and administrative data collected during patient hospital stays in the form of both structured and unstructured data. Structured data generally includes diagnostic codes, lab tests, demographics, admission-related information, etc. It can be either static, e.g., patient demographics, or temporal, e.g., vital signs. Unstructured data includes various documents in natural language such as clinical notes, nursing reports, discharge summaries, lab reports, etc. For this work, we consider EHRs from the MIMIC-III (Medical Information Mart for Intensive Care, version 3) database <ref type="bibr" target="#b9">[10]</ref> to learn patient representations (i.e., patient embeddings) by converting patient data from the raw EHRs to embeddings that can be further processed. 
MIMIC-III is a free publicly available hospital database containing de-identified patient health data. This database has been widely used by researchers conducting data mining and machine learning studies applied to healthcare. Several neural network architectures have been developed to represent biomedical data. For instance, Si et al. <ref type="bibr" target="#b10">[11]</ref> adapted a multi-level CNN to learn patient representations from clinical notes through a multi-task learning framework to predict patient mortality and length of stay. Zhang et al. <ref type="bibr" target="#b11">[12]</ref> used GRU-based RNN to capture relationships between clinical events and employed attention mechanism to learn a personalized representation to predict patient's future hospitalization using EHR data. Madhumita et al. <ref type="bibr" target="#b12">[13]</ref> used a stacked denoising autoencoder and a paragraph vector model to learn generalized patient representations directly from clinical notes to predict patient mortality, primary diagnostic, procedural category, and patient gender. Zhang et al. <ref type="bibr" target="#b13">[14]</ref> proposed two neural network architectures that enhance patient representation learning by combining sequential unstructured notes with structured data and evaluated these representations on 3 risk evaluation tasks (i.e., in-hospital mortality, 30-day hospital readmission, and length of stay prediction). In our paper, we learn patient-stay representations and consider the task of patient-stay identification. We think that the tools that address this task will serve as building blocks for more complex and key biomedical tasks, such as patient matching and privacy preservation checking <ref type="bibr" target="#b14">[15,</ref><ref type="bibr" target="#b15">16]</ref>.</p><p>In this paper, we particularly propose to tackle this task by relying on the detection of analogies in healthcare. 
In Section 2, we define the setting of analogy that we work on. The models we propose to detect analogies are described in Section 3, along with the procedures we use for data augmentation, training, and evaluation. In Section 4, we provide a description of the MIMIC-III dataset and detail how we build our experimental dataset. We present our experiments and report our results in Section 5. In Section 6, we discuss perspectives for future research.</p><p>The main contributions of this paper are the following:</p><p>• we propose an analogy based setting using patient-stay representations;</p><p>• we propose an embedding model to learn patient-stay representations;</p><p>• we display the performance of our classification model to detect analogies on patient-stay data.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Defining the task</head><p>As we defined previously, an analogy is a 4-ary relation written as 𝐴 : 𝐵 :: 𝐶 : 𝐷 and expressed as "𝐴 is to 𝐵 as 𝐶 is to 𝐷". In this paper, we work on patient-stay analogies, i.e., on analogies involving hospital stays. In our setting, 𝐴, 𝐵, 𝐶, and 𝐷 represent patient-stay representations.</p><p>We define an analogy based setting on patient-stay data that we refer to as Identity setting.</p><p>For that, we consider patient-stay representations, which are vector representations of EHR data that belong to a single hospital stay. Based on the type of EHR data that we decide to include, our patient-stay representations can be made of a representation of either structured or unstructured data, or they can be made of the concatenation of both types of data. More details are provided in Section 5. For this setting, we propose to build analogies of the form:</p><formula xml:id="formula_0">s^{i_1}_{t_1} : s^{i_1}_{t_2} :: s^{i_2}_{t_3} : s^{i_2}_{t_4}</formula><p>where $s^{i}_{t}$ refers to the stay $t$ of patient $i$. Here, pairs of the analogy quadruples are made of two random stays belonging to the same patient. Since there is no constraint on the order of stays, $s^{i_1}_{t_1}$ can happen before $s^{i_1}_{t_2}$ or the inverse. Note that $i_1$ and $i_2$ can be the same patient, and that $t_1$ and $t_2$, or $t_3$ and $t_4$, can represent the same time stamp. Furthermore, $t_1$ and $t_3$ or $t_2$ and $t_4$ can be the same when $i_1 = i_2$ (but not when $i_1 \neq i_2$). The Identity setting finds applications in several tasks relevant to biomedical informatics, including:</p><p>• data cleaning, • data privacy related application, • patient matching.</p><p>Data cleaning applications in the health domain involve repairing or removing patient health data that is inaccurate, incorrectly structured, duplicative, or incomplete. In data cleaning applications, we can associate an erroneously assigned sample of data with the patient it belongs to. 
Privacy related applications include verifying if patient data is de-identified, and whether it can be re-identified using different systems. Patient matching is defined as the identification and linking of one patient's data within and across health databases in order to obtain a comprehensive view of that patient's health care record <ref type="bibr" target="#b16">[17]</ref>. In patient matching, we try to match patient-related information, either a single patient data (e.g., a document) or full EHR data, that can coexist in one or several databases.</p><p>In this paper, we try to match patient-stay representations to the patient they belong to. We focus on the task of patient-stay identification, where we aim to determine if a particular hospital stay belongs to a certain patient. We address this task by learning a model to classify such quadruples into valid and invalid analogies. In this sense, we implement the task of analogy detection that aims to determine if a quadruple is a valid analogy. For our Identity setting, we define a valid analogy as a quadruple of four stays</p><formula xml:id="formula_1">(s^{i_1}_{t_1}, s^{i_1}_{t_2}, s^{i_2}_{t_3}, s^{i_2}_{t_4})</formula><p>, where each pair of two stays belongs to a single patient $i_j$; other forms of analogies are considered invalid.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Proposed Approach</head><p>Our model is made of two components: an embedding model and a classification model. The second takes as input patient-stay representations computed by the first (see Section 3.1). Our embedding model is trained along with the classification model. We also detail the data augmentation procedure in Section 3.2, and describe the training and evaluation protocols that we followed in Section 3.3.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Embedding and Classification Models</head><p>The models described in this subsection are schematized in Figure <ref type="figure" target="#fig_0">1</ref>. Classification Model. As in Alsaidi et al. <ref type="bibr" target="#b5">[6]</ref>, we adapt the neural architecture in Lim et al. <ref type="bibr" target="#b2">[3]</ref> to our patient-stay setting. Our classification model determines if an analogy 𝐴 : 𝐵 :: 𝐶 : 𝐷 is valid by verifying if 𝐴 and 𝐵 differ in the same way as 𝐶 and 𝐷. The architecture of the classification model is a Convolutional Neural Network (CNN), which takes as input the embeddings of size 𝑛 of four elements 𝐴, 𝐵, 𝐶, 𝐷. We stack them to get a matrix of size 𝑛 × 4. The CNN is made of three layers as depicted in the right frame of Figure <ref type="figure" target="#fig_0">1</ref>. The first convolutional layer with 128 filters of 1 by 2 is applied on the embeddings, such that it analyses each pair separately without overlaps and measures how 𝐴 and 𝐵, and how 𝐶 and 𝐷 differ for each component. The second convolutional layer with 64 filters of 2 by 2 is applied on the resulting matrix, after which the result is flattened into a 64 × (𝑛 − 1) unidimensional vector and used as input of a fully connected dense layer that produces a single output. The second layer aims at checking if the difference between 𝐴 and 𝐵 is the same as the one between 𝐶 and 𝐷. If 𝐴 and 𝐵 are different in the same way as 𝐶 and 𝐷, then 𝐴 : 𝐵 :: 𝐶 : 𝐷 is a valid analogy. The last layer aggregates this information using a sigmoid activation to get a result (i.e., output of the classification model) between 0 (for invalid analogies) and 1 (for valid analogies). All layers, except the last one, use Rectified Linear Unit (ReLU) as activation function.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2.">Data Augmentation</head><p>Deep neural network approaches require large amounts of data. Therefore we took advantage of properties of analogies to produce additional proportions based on our dataset in a process called data augmentation. Previous works <ref type="bibr" target="#b18">[19,</ref><ref type="bibr" target="#b19">20,</ref><ref type="bibr" target="#b20">21]</ref> have proposed postulates that analogies should obey. For this study, we consider the following: Based on the definition of our analogical setting, we can apply all the above-mentioned postulates to generate valid analogical proportions except for central permutation, which can only be applied in the very particular case when $i_1 = i_2$. When $i_1 \neq i_2$, central permutation cannot be applied to increase our dataset as it would allow associating stays of distinct patients, which is inconsistent with the aim of the Identity setting. Note that from reflexivity and central permutation we can deduce inner reflexivity. As reflexivity forces $i_1 = i_2$, applying it in cases where $i_1 \neq i_2$ would result in a case where $i_1 = i_2$.</p><p>For the cases where $i_1 \neq i_2$, given a valid analogy we can generate eight additional valid analogical proportions, namely For cases where $i_1 = i_2$, we apply reflexivity to generate one more valid analogical proportion, namely 𝐴 : 𝐵 :: 𝐴 : 𝐵. Note that for cases where $i_1 = i_2$, invalid analogical proportions would be considered valid.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3.">Training and Evaluation</head><p>As mentioned, we define a valid analogy as a quadruple of four stays</p><formula xml:id="formula_2">(s^{i_1}_{t_1}, s^{i_1}_{t_2}, s^{i_2}_{t_3}, s^{i_2}_{t_4})</formula><p>, where each pair of two stays belongs to a single patient $i_j$. For each analogy in the dataset, we start by embedding the four stays. We augment the embeddings using the postulates that we recalled in Section 3.2. As a result, we generate 9 valid analogical proportions (i.e., positive examples) and 2 invalid analogical proportions for cases where $i_1 \neq i_2$. For cases where $i_1 = i_2$, we obtain 10 + 2 = 12 valid analogical proportions and no invalid analogical proportions. For optimization, we use the Binary Cross-Entropy (BCE) loss. To evaluate the classification model we use the same data augmentation process as for training, and we compute the accuracy and F1 score.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Dataset description</head><p>For our experiments, we used EHRs from the MIMIC-III <ref type="bibr" target="#b9">[10]</ref> as a source of patient medical history data. MIMIC-III is a critical care database developed by the Massachusetts Institute of Technology (MIT)'s Laboratory for Computational Physiology and distributed by PhysioNet <ref type="bibr" target="#b21">[22]</ref>. The database is publicly available, where it is accessible to researchers after finishing a HIPAA training course demanded by the National Institutes of Health (NIH). The database contains health-related information associated with all patients admitted to the ICU (Intensive Care Unit) of Beth Israel Deaconess Medical Center between the years 2001 and 2012. It encompasses data of more than 40,000 ICU patients with more than 60,000 ICU stays. All patients' data has been de-identified in accordance with the Health Insurance Portability and Accountability Act (HIPAA). The dataset contains various types of data such as patient demographics, vital signs, lab test results, medications, hospital length of stay, procedures, clinical notes, diagnosis codes (ICD-9), imaging reports, etc.</p><p>To build our dataset, we keep only adult patients (i.e., patients aged 18 and above) with at least two admissions. As we do not define any order constraint, we obtain all the permutations of all the stays belonging to a patient. We organize our dataset in a way where each pair of stays is associated with the patient it belongs to: $\langle S_1, S_2, \mathit{PATIENT\_ID} \rangle$, where $S_1$ corresponds to $s^{i_1}_{t_1}$, $S_2$ corresponds to $s^{i_1}_{t_2}$, and the associated $\mathit{PATIENT\_ID}$ represents $i_1$. We obtain a dataset made of 46,986 triples, where for each two pairs of stays we produce an analogy. For our experiments, we use all hospital stays associated with 200 randomly selected patients. We use the data augmentation process to generate positive and negative examples. 
For training and evaluation, we perform a random split (using a fixed random seed) in a training set of 70% of the extracted analogies, the remaining 30% serving as the test set. We end up with 939,638 analogies for training and 402,703 for testing. To maintain reasonable training and evaluation time, we randomly selected 50,000 analogies from the training set and 50,000 analogies from the testing set.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Experiment Setup</head><p>We now present the three experiments that we conducted in the Identity setting. In Section 5.1, we describe the patient-stay features that we consider and the data preprocessing that we performed for structured and unstructured data. We describe the implementation details in Section 5.2. The results of our experiments are reported in Section 5.3 and discussed further in this section. The code used for our experiments is written in Python 3.9 and PyTorch and is available in the repository https://github.com/Safa-98/patient-stay-analogy.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.1.">Stay Features and Data Preprocessing</head><p>We consider both structured (i.e., demographics and admission-related information) and unstructured data (i.e., clinical notes) to define our analogies. In this subsection, we describe the patient-stay features that are utilized by our model and some data preprocessing details.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Static information.</head><p>In our experiments, our static information consists of demographic information and admission-related information. For demographic information, we extract the patient's age, gender, marital status, ethnicity, and insurance information. We keep only adult patients (i.e., patients aged 18 and above). We split the age into 5 groups: [18, 25[, [25, 45[, [45, 65[, [65, 89[ and [89, +∞[. For admission-related information, we include admission type as a feature.</p><p>Clinical notes. Nursing, Nursing/Other, Physician, and Radiology notes make up the majority of clinical notes in MIMIC-III database. For each hospital stay, we only kept notes that belong to these 4 categories. We excluded notes that have an error tag and notes that lack a hospital admission id.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.2.">Implementation Details</head><p>To build our corresponding cohorts, we performed the preprocessing described in the previous section to obtain our patient-stay features. Patients without any records of clinical notes or with notes that do not belong to the 4 categories defined above were removed. We computed the median of notes per hospital admission to determine the number of clinical notes to extract per hospital admission. Therefore, we kept the first 12 notes, and used padding (i.e., completion with zeros) for hospital admissions with fewer than 12 notes. For the unsupervised Doc2Vec model <ref type="bibr" target="#b17">[18]</ref>, we fine-tune it on the training set to obtain the document-level embeddings using the Gensim toolkit <ref type="bibr" target="#b22">[23]</ref>. For the training algorithm, we use PV-DBOW (Paragraph vector-Distributed Bag of Words). We set the number of training epochs as 30, the initial learning rate as 0.025, the learning rate decay as 0.0002, and the dimension of vectors as 200 to train. The Fusion CNN model is trained with the Adam optimizer with a learning rate of 0.0001 and ReLU as the activation function. The chosen batch size is 64.</p><p>In this paper we perform three experiments. In the first, we consider both structured and unstructured data. Therefore, we obtain our patient-stay representation by concatenating the representations of clinical notes along with static information. In this experiment, we verify if a particular hospital stay belongs to a patient by looking at both the structured and unstructured data associated with each stay. In the second, we only consider unstructured data, which means that our patient-stay representations are based solely on the representations of clinical notes. Therefore, by looking at clinical notes associated with a single hospital stay, we check if a particular hospital stay belongs to a patient. 
In the third, we only consider structured data, which means that our patient-stay representations are based solely on the representations of static information (i.e., demographics and admission-related information). Therefore, we verify if a particular hospital stay belongs to a patient by looking at the static information that is associated with a hospital stay.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.3.">Results and Discussion</head><p>As mentioned previously, we conducted three experiments that mainly differ in what type of data was used to obtain our patient-stay representations. For all the experiments, we used 50,000 analogies for training and evaluation, and applied the same procedure for data augmentation. We report the accuracy and F1 score for each experiment. The F1 score gives a better measure of the incorrectly classified cases than the accuracy metric.</p><p>For the first experiment, we fed our embedding model with both structured (i.e., demographics and admission-related information) and unstructured data (i.e., clinical notes). Our patient-stay representations are thus made of the concatenation of static information and clinical notes. We chose the epochs where the training loss is at the local minimum. We trained our model for 10, 20, and 40 epochs, with 3 different random initializations in each case. Our results are detailed in Table <ref type="table" target="#tab_0">1</ref>. Our model performs the best for positive examples. The model gives the best result for valid analogies with 40 epochs and performs best for invalid analogies with 20 epochs.</p><p>To gain more insight into how our models perform, we conducted an error analysis where we noticed that most misclassifications were spotted in two cases.</p><p>1. Cases where $i_1 = i_2$.</p><p>To recall, we do not generate invalid analogies for cases where $i_1 = i_2$; therefore, invalid analogy forms (𝐷 : 𝐴 :: 𝐵 : 𝐶 and 𝐴 : 𝐶 :: 𝐵 : 𝐷) should be considered valid in these cases. In our error analysis, we noticed that when the four stays belong to the same patient, our model classifies the above-mentioned invalid analogy forms as invalid instead of valid. We believe that our model was not trained enough to distinguish these forms of analogies as there were fewer analogies with four stays belonging to the same patient generated in our dataset.</p>
<p>2. Cases where representations are made of only clinical notes.</p><p>To recall, in our second experiment we only used the representations of clinical notes to obtain patient-stay representations. We noticed that when the category of the clinical notes is similar between two hospital stays or when two hospital stays have fewer than five clinical notes, our model struggles to distinguish between the two hospital stays. This indicates that in some cases using only clinical notes to learn patient-stay representations might not be sufficient as these notes might not contain enough information to help our model differentiate between two similar stays that belong to two distinct patients. As a result, the model would incorrectly match these two similar stays to the same patient.</p><p>In these experiments, we did not include temporal data; we only used demographics and admission-related information as structured data. It would be interesting to also include temporal signals (i.e., vital signs) along with demographics and admission-related information as structured data. Our patient-stay representations would then be made of the concatenation of the representations of static information and temporal signals as structured data and the representation of clinical notes as unstructured data.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.">Conclusion and Perspectives</head><p>We adapted the approach in <ref type="bibr" target="#b2">[3,</ref><ref type="bibr" target="#b5">6]</ref> from semantic and morphological analogies to patient-stay analogies. Our prototypical architecture has some limits, but seems promising for the task of patient identification. Our classification model is flexible in terms of the analogies that it classifies. Changing the way the data is augmented will change the way the model behaves. Our model can be adapted to different healthcare applications through dedicated embedding models <ref type="bibr" target="#b23">[24]</ref>. Inspired by <ref type="bibr" target="#b13">[14]</ref>, we implemented a model to build patient-stay representations. As mentioned in Section 5.3, there are multiple plausible improvements to our approach, in terms of balancing valid and invalid analogies as well as including other types of data to build our patient-stay representations. As we limited ourselves to analogy detection, a future work would be to address analogy solving in the same setting that would allow the generation of synthetic patient-stays.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: The Fusion CNN embedding model and the CNN classification model.</figDesc><graphic coords="5,89.29,186.50,416.64,154.18" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>•</head><label></label><figDesc>𝐴 : 𝐵 :: 𝐴 : 𝐵 (reflexivity); • 𝐴 : 𝐴 :: 𝐶 : 𝐶 (inner reflexivity); • 𝐴 : 𝐵 :: 𝐶 : 𝐷 → 𝐶 : 𝐷 :: 𝐴 : 𝐵 (symmetry); • 𝐴 : 𝐵 :: 𝐶 : 𝐷 → 𝐵 : 𝐴 :: 𝐷 : 𝐶 (inner symmetry); • 𝐴 : 𝐵 :: 𝐶 : 𝐷 → 𝐴 : 𝐶 :: 𝐵 : 𝐷 (central permutation).</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>•</head><label></label><figDesc>𝐶 : 𝐷 :: 𝐴 : 𝐵, • 𝐷 : 𝐶 :: 𝐵 : 𝐴, • 𝐵 : 𝐴 :: 𝐷 : 𝐶, • 𝐴 : 𝐴 :: 𝐶 : 𝐶, • 𝐵 : 𝐴 :: 𝐶 : 𝐷, • 𝐴 : 𝐵 :: 𝐷 : 𝐶, • 𝐶 : 𝐷 :: 𝐵 : 𝐴, • 𝐷 : 𝐶 :: 𝐴 : 𝐵; and two invalid analogical proportions, namely • 𝐷 : 𝐴 :: 𝐵 : 𝐶 and • 𝐴 : 𝐶 :: 𝐵 : 𝐷.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 1</head><label>1</label><figDesc>Accuracy and F1 score (both in %) of 3 runs of the classification model. Embeddings used are concatenation of static information and clinical notes.</figDesc><table><row><cell>Epochs</cell><cell>Valid</cell><cell>Invalid</cell><cell>F1</cell></row><row><cell>40 epochs</cell><cell>98.41 ± 1.56</cell><cell>68.22 ± 1.94</cell><cell>95.79 ± 0.59</cell></row><row><cell>20 epochs</cell><cell>94.89 ± 1.74</cell><cell>72.08 ± 1.68</cell><cell>94.30 ± 0.80</cell></row><row><cell>10 epochs</cell><cell>96.85 ± 1.75</cell><cell>70.31 ± 1.94</cell><cell>95.20 ± 0.71</cell></row><row><cell cols="4">per hospital admission. Therefore, we kept the first 12 notes, and used padding (i.e., completion</cell></row><row><cell cols="3">with zeros) for hospital admissions with less than 12 notes.</cell><cell></cell></row></table></figure>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>Experiments presented in this paper were carried out using computational clusters equipped with GPU from the Grid'5000 testbed (see https://www.grid5000.fr).</p><p>The research work of the second and third named authors is partially supported by TAILOR, a project funded by EU Horizon 2020 research and innovation program under GA No 952215, and the Inria Project Lab "Hybrid Approaches for Interpretable AI" (HyAIAI).</p></div>
			</div>

			<div type="annex">
<div xmlns="http://www.tei-c.org/ns/1.0"><p>For our second experiment, we used only unstructured data, i.e., the $Z_{note}$ part of the embedding for the patient-stay representations. Our patient-stay representations thus consisted of only the representation of clinical notes. The training loss was at the local minimum for 15, 20, and 40 epochs. Therefore, we trained our model for 15, 20, and 40 epochs, with 3 different random initializations in each case. As shown in Table <ref type="table">2</ref>, our model performs the best for positive examples when we train for 20 epochs.</p><p>For our third experiment, we used only structured data, i.e., $Z_{static}$, to build our patient-stay representations. Our training loss was at the local minimum for 15, 20, and 40 epochs. Therefore, we trained our model for 15, 20, and 40 epochs, with 3 different random initializations in each case. We report our results in Table <ref type="table">3</ref>. As seen, the accuracy for positive examples is high for all cases compared to negative examples where the accuracy drops.</p><p>In all our experiments, we can see that our model performs the best for positive examples regardless of whether we use $[Z_{static}; Z_{note}]$, only $Z_{note}$, or only $Z_{static}$ for the patient-stay representations. This can be explained as a result of the imbalance between positive and negative examples in the training data. Balancing the data would be the next step as it proved to be a good solution for <ref type="bibr" target="#b5">[6]</ref> to get similar results for positive and negative examples. The accuracy for valid analogies is the highest when our embedding model is fed with only static information. Between the first and the second experiment, the accuracy is the highest for valid analogies when the patient-stay representations are made of the concatenation in contrast to when our patient-stay representations are made of only clinical notes. 
This indicates that adding or using static information when learning patient-stay representations, as in the first and third experiment, improves the performance of our model, where it allows the model to better distinguish the stays and to match them to the patient they belong to. We also notice that the accuracy for invalid analogies is the highest when the embedding model is fed with only clinical notes. For all performed experiments, the F1 score is high, which indicates that our model is able to correctly classify analogies to the class they belong to (i.e., valid or invalid).</p></div>			</div>
			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Morphological predictability of unseen words using computational analogy</title>
		<author>
			<persName><forename type="first">R</forename><surname>Fam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Lepage</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Workshops Proceedings for the Twenty-fourth International Conference on Case-Based Reasoning (ICCBR)</title>
				<imprint>
			<date type="published" when="2016">2016</date>
			<biblScope unit="volume">1815</biblScope>
			<biblScope unit="page" from="51" to="60" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">Deep visual analogy-making</title>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">E</forename><surname>Reed</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Lee</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Advances in Neural Information Processing Systems (NeurIPS)</title>
				<meeting>the Advances in Neural Information Processing Systems (NeurIPS)</meeting>
		<imprint>
			<date type="published" when="2015">2015</date>
			<biblScope unit="page" from="1252" to="1260" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Solving word analogies: A machine learning perspective</title>
		<author>
			<persName><forename type="first">S</forename><surname>Lim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Prade</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Richard</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Symbolic and Quantitative Approaches to Reasoning with Uncertainty (ECSQARU)</title>
				<meeting>the Symbolic and Quantitative Approaches to Reasoning with Uncertainty (ECSQARU)</meeting>
		<imprint>
			<date type="published" when="2019">2019</date>
			<biblScope unit="volume">11726</biblScope>
			<biblScope unit="page" from="238" to="250" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">GloVe: Global vectors for word representation</title>
		<author>
			<persName><forename type="first">J</forename><surname>Pennington</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Socher</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">D</forename><surname>Manning</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Empirical Methods in Natural Language Processing (EMNLP)</title>
				<meeting>the Empirical Methods in Natural Language Processing (EMNLP)</meeting>
		<imprint>
			<date type="published" when="2014">2014</date>
			<biblScope unit="page" from="1532" to="1543" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Image analogies</title>
		<author>
			<persName><forename type="first">A</forename><surname>Hertzmann</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">E</forename><surname>Jacobs</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Oliver</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Curless</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Salesin</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 28th Annual Conference on Computer Graphics and Interactive Techniques (SIGGRAPH)</title>
				<meeting>the 28th Annual Conference on Computer Graphics and Interactive Techniques (SIGGRAPH)</meeting>
		<imprint>
			<date type="published" when="2001">2001</date>
			<biblScope unit="page" from="327" to="340" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">A neural approach for detecting morphological analogies</title>
		<author>
			<persName><forename type="first">S</forename><surname>Alsaidi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Decker</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Lay</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Marquer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P.-A</forename><surname>Murena</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Couceiro</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 8th IEEE International Conference on Data Science and Advanced Analytics (DSAA)</title>
				<meeting>the 8th IEEE International Conference on Data Science and Advanced Analytics (DSAA)</meeting>
		<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="1" to="10" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">Semantic deep learning: Prior knowledge and a type of four-term embedding analogy to acquire treatments for well-known diseases</title>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">A</forename><surname>Casteleiro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">D</forename><surname>Diz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Maroto</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">J F</forename><surname>Prieto</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Peters</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Wroe</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">S</forename><surname>Torrado</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">M</forename><surname>Fernandez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Stevens</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">JMIR Medical Informatics</title>
		<imprint>
			<biblScope unit="volume">8</biblScope>
			<biblScope unit="page" from="1" to="28" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">Word embedding for the French natural language in health care: comparative study</title>
		<author>
			<persName><forename type="first">E</forename><surname>Dynomant</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Lelong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Dahamna</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Massonnaud</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Kerdelhué</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Grosjean</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Canu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">J</forename><surname>Darmoni</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">JMIR Medical Informatics</title>
		<imprint>
			<biblScope unit="volume">7</biblScope>
			<biblScope unit="page" from="118" to="122" />
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Using deep learning towards biomedical knowledge discovery</title>
		<author>
			<persName><forename type="first">N</forename><forename type="middle">N</forename><surname>Rather</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Patel</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">A</forename><surname>Khan</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">International Journal of Mathematical Sciences and Computing</title>
		<imprint>
			<biblScope unit="volume">3</biblScope>
			<biblScope unit="page" from="1" to="10" />
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
	<note>(IJMSC)</note>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">MIMIC-III, a freely accessible critical care database</title>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">E W</forename><surname>Johnson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">J</forename><surname>Pollard</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Shen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L.-W</forename><forename type="middle">H</forename><surname>Lehman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Feng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">M</forename><surname>Ghassemi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Moody</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Szolovits</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">A</forename><surname>Celi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">G</forename><surname>Mark</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Scientific Data</title>
		<imprint>
			<biblScope unit="volume">3</biblScope>
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">Deep representation learning of patient data from electronic health records (EHR): A systematic review</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Si</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Du</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Jiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Miller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><forename type="middle">J</forename><surname>Zheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Roberts</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of Biomedical Informatics</title>
		<imprint>
			<biblScope unit="volume">115</biblScope>
			<biblScope unit="page" from="1" to="42" />
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Patient2vec: A personalized interpretable deep representation of the longitudinal electronic health record</title>
		<author>
			<persName><forename type="first">J</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Kowsari</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">H</forename><surname>Harrison</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">M</forename><surname>Lobo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">E</forename><surname>Barnes</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Access</title>
		<imprint>
			<biblScope unit="volume">6</biblScope>
			<biblScope unit="page" from="65333" to="65346" />
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">Patient representation learning and interpretable evaluation using clinical notes</title>
		<author>
			<persName><forename type="first">M</forename><surname>Sushil</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Šuster</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Luyckx</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Daelemans</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of Biomedical Informatics</title>
		<imprint>
			<biblScope unit="volume">84</biblScope>
			<biblScope unit="page" from="103" to="113" />
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">Combining structured and unstructured data for predictive models: a deep learning approach</title>
		<author>
			<persName><forename type="first">D</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Yin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Zeng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Yuan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Zhang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">BMC Medical Informatics and Decision Making</title>
		<imprint>
			<biblScope unit="volume">20</biblScope>
			<biblScope unit="page">280</biblScope>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">A review of current patient matching techniques</title>
		<author>
			<persName><forename type="first">P</forename><surname>Waruhari</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Babic</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Nderu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">C</forename><surname>Were</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Informatics Empowers Healthcare Transformation (ICIMTH)</title>
		<imprint>
			<biblScope unit="volume">238</biblScope>
			<biblScope unit="page" from="205" to="208" />
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">Privacy-preserving data sharing infrastructures for medical research: systematization and comparison</title>
		<author>
			<persName><forename type="first">F</forename><forename type="middle">N</forename><surname>Wirth</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Meurers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Johns</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Prasser</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">BMC Medical Informatics and Decision Making</title>
		<imprint>
			<biblScope unit="volume">21</biblScope>
			<biblScope unit="page">242</biblScope>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">Why patient matching is a challenge: Research on master patient index (MPI) data discrepancies in key identifying fields</title>
		<author>
			<persName><forename type="first">B</forename><forename type="middle">H</forename><surname>Just</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">T</forename><surname>Marc</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Munns</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">H</forename><surname>Sandefer</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Perspectives in health information management</title>
		<imprint>
			<biblScope unit="volume">13</biblScope>
			<biblScope unit="page">1</biblScope>
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<analytic>
		<title level="a" type="main">Distributed representations of sentences and documents</title>
		<author>
			<persName><forename type="first">Q</forename><forename type="middle">V</forename><surname>Le</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Mikolov</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 31st International Conference on Machine Learning (ICML)</title>
				<meeting>the 31st International Conference on Machine Learning (ICML)</meeting>
		<imprint>
			<date type="published" when="2014">2014</date>
			<biblScope unit="volume">32</biblScope>
			<biblScope unit="page" from="1188" to="1196" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">Analogical dissimilarity: Definition, algorithms and two experiments in machine learning</title>
		<author>
			<persName><forename type="first">L</forename><surname>Miclet</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Bayoudh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Delhay</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of Artificial Intelligence Research</title>
		<imprint>
			<biblScope unit="volume">32</biblScope>
			<biblScope unit="page" from="793" to="824" />
			<date type="published" when="2008">2008</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<monogr>
		<author>
			<persName><forename type="first">Y</forename><surname>Lepage</surname></persName>
		</author>
		<title level="m">De l&apos;analogie rendant compte de la commutation en linguistique</title>
				<imprint>
			<date type="published" when="2003">2003</date>
		</imprint>
		<respStmt>
			<orgName>Université Joseph-Fourier - Grenoble I</orgName>
		</respStmt>
	</monogr>
	<note type="report_type">Habilitation à diriger des recherches</note>
</biblStruct>

<biblStruct xml:id="b20">
	<monogr>
		<author>
			<persName><forename type="first">C</forename><surname>Antic</surname></persName>
		</author>
		<idno>ArXiv abs/2006.02854</idno>
		<title level="m">Analogical proportions</title>
				<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b21">
	<analytic>
		<title level="a" type="main">Physiobank, physiotoolkit, and physionet: components of a new research resource for complex physiologic signals</title>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">L</forename><surname>Goldberger</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">A N</forename><surname>Amaral</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Glass</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">M</forename><surname>Hausdorff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">C</forename><surname>Ivanov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">G</forename><surname>Mark</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">E</forename><surname>Mietus</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><forename type="middle">B</forename><surname>Moody</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C.-K</forename><surname>Peng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><forename type="middle">E</forename><surname>Stanley</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Circulation</title>
		<imprint>
			<biblScope unit="volume">101</biblScope>
			<biblScope unit="page" from="E215" to="220" />
			<date type="published" when="2000">2000</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<analytic>
		<title level="a" type="main">Software Framework for Topic Modelling with Large Corpora</title>
		<author>
			<persName><forename type="first">R</forename><surname>Řehůřek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Sojka</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the LREC Workshop on New Challenges for NLP Frameworks</title>
				<meeting>the LREC Workshop on New Challenges for NLP Frameworks</meeting>
		<imprint>
			<date type="published" when="2010">2010</date>
			<biblScope unit="page" from="45" to="50" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b23">
	<analytic>
		<title level="a" type="main">Exploring analogical inference in healthcare</title>
		<author>
			<persName><forename type="first">S</forename><surname>Alsaidi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Couceiro</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Burgun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Garcelon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Coulet</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Workshop on Interactions between Analogical Reasoning and Machine Learning (IARML)</title>
				<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
	<note>to appear</note>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
