<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Development of an Adverse Drug Reaction Corpus from Consumer Health Posts</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><roleName>MS, PhD Candidate</roleName><forename type="first">Maryam</forename><surname>Zolnoori</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Department of Health Sciences</orgName>
								<orgName type="institution">University of Wisconsin Milwaukee</orgName>
								<address>
									<settlement>Milwaukee</settlement>
									<region>WI</region>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><roleName>MS, PhD</roleName><forename type="first">Timothy</forename><forename type="middle">B</forename><surname>Patrick</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Department of Health Sciences</orgName>
								<orgName type="institution">University of Wisconsin Milwaukee</orgName>
								<address>
									<settlement>Milwaukee</settlement>
									<region>WI</region>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><roleName>MD</roleName><forename type="first">Kin</forename><forename type="middle">Wah</forename><surname>Fung</surname></persName>
							<affiliation key="aff1">
								<orgName type="institution">National Library of Medicine</orgName>
								<address>
									<settlement>Bethesda</settlement>
									<region>MD</region>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><roleName>MD, MPH</roleName><forename type="first">Paul</forename><surname>Fontelo</surname></persName>
							<affiliation key="aff1">
								<orgName type="institution">National Library of Medicine</orgName>
								<address>
									<settlement>Bethesda</settlement>
									<region>MD</region>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><roleName>PhD</roleName><forename type="first">Anthony</forename><surname>Faiola</surname></persName>
							<affiliation key="aff2">
								<orgName type="department">Department of Biomedical and Health Information Sciences</orgName>
								<orgName type="institution">University of Illinois at Chicago</orgName>
								<address>
									<settlement>Chicago</settlement>
									<region>IL</region>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Yi</forename><surname>Shuan</surname></persName>
						</author>
						<author>
							<persName><roleName>PharmD Candidate</roleName><forename type="first">Shirley</forename><surname>Wu</surname></persName>
						</author>
						<author>
							<persName><roleName>PharmD Candidate</roleName><forename type="first">Kelly</forename><surname>Xu</surname></persName>
							<affiliation key="aff3">
								<orgName type="department">School of Pharmacy</orgName>
								<orgName type="institution">University of Pittsburgh</orgName>
								<address>
									<settlement>Pittsburgh</settlement>
									<region>PA</region>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><roleName>MS</roleName><forename type="first">Jiaxi</forename><surname>Zhu</surname></persName>
							<affiliation key="aff3">
								<orgName type="department">School of Pharmacy</orgName>
								<orgName type="institution">University of Pittsburgh</orgName>
								<address>
									<settlement>Pittsburgh</settlement>
									<region>PA</region>
								</address>
							</affiliation>
							<affiliation key="aff4">
								<orgName type="institution">Emmes Corporation</orgName>
								<address>
									<settlement>Rockville</settlement>
									<region>MD</region>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><roleName>MD, PhD Candidate</roleName><forename type="first">Christina</forename><forename type="middle">E</forename><surname>Eldredge</surname></persName>
							<affiliation key="aff5">
								<orgName type="department">School of Information</orgName>
								<orgName type="institution">University of South Florida</orgName>
								<address>
									<settlement>Tampa</settlement>
									<region>FL</region>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">Development of an Adverse Drug Reaction Corpus from Consumer Health Posts</title>
					</analytic>
					<monogr>
						<imprint>
							<date/>
						</imprint>
					</monogr>
					<idno type="MD5">CC26288385024B9AB211573106C7104D</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2023-03-23T20:08+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>UWM-Adverse Drug Events Corpus (UWM-ADEC) is an annotated corpus that has been developed from consumer drug review posts in social media. In this corpus, we identified four types of Adverse Drug Reactions (ADRs) including physiological, psychological, cognitive, and functional problems. Additionally, we mapped the ADRs to corresponding concepts in the Unified Medical Language System (UMLS). The quality of the corpus was measured using well-defined guidelines, double coding, high inter-annotator agreement, and final reviews by pharmacists and clinical terminologists. This corpus is a valuable source for research in the area of text mining and machine learning for ADR identification from consumer health posts, specifically for psychiatric medications.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Introduction</head><p>Clinical trials and post-marketing surveillance systems established by regulatory agencies, such as the Adverse Event Reporting System (AERS) of the Food and Drug Administration (FDA), are not sensitive enough to detect the potential risks of drugs before marketing and moreover the occurrence of potential adverse drug reactions (ADRs) after wider use in patients. It is estimated that current surveillance systems capture less than 10% of the ADRs occurrence, due to voluntary nature of data collection and perhaps, patients' negative perceptions of the reporting systems <ref type="bibr" target="#b13">(Yang, Yang, Jiang, &amp; Zhang, 2012)</ref>. These limitations have led to major concerns in public health because of recent reports thousands of incidents of hospitalizations and deaths <ref type="bibr" target="#b5">(Karimi, Metke-Jimenez, Kemp, &amp; Wang, 2015)</ref>.</p><p>Recent studies have shown the potential significance of using consumer health posts from social media as a supplementary health data source to improve identifying ADRs. Therefore, regulatory agencies such as the FDA's Sentinel Initiative, have considered this source for actively monitoring for ADRs. However, there are challenges to automatic extraction of ADRs from social media, such as colloquial expressions of ADRs and deviation of sentence/phrase structure from formal sentence/phrase structure. These deviations can significantly reduce recall and precision of the automatic extraction of ADRs from consumer health posts.</p><p>A human annotated corpus can significantly improve performance of computerized systems aimed to identify health entities from unstructured consumer health posts. Development of such corpus is a very costly process. 
In line with the needs of improving performance of text mining algorithms in the area of pharmacovigilance, we developed a corpus of ADRs from a healthcare forum called "askapatient.com", which collects drug reviews from patients. We extracted ADRs from the review posts in this forum and mapped them to their corresponding terms in Unified medical language Systems (UMLS). To our knowledge, this corpus is the first corpus that covers a wide-range of ADRs associated with psychiatric medications, including physiological, psychological, cognitive, and functional adverse reactions.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Background</head><p>The lexicon-based approach for named entity recognition in the area of pharmacovigilance currently dominates other methods of health entity extraction in consumer health posts. The lexicons have been mostly developed by combining standard medical vocabularies including COSTART (that was developed by the FDA for coding postmarketing ADR reports and was later replaced by MedDRA), the FDA Adverse Event Reporting System (FAERS), MedEffect (Canadian Adverse Reaction and Medical Device Problem Reporting database), SIDER (which has been developed based on resources published by public sources, mainly the FDA such as structured product labeling (SPL)), the Drug Bank Database, and the European Agency for the Evaluation of Medicinal Products (EMEA). The lexicons were mainly built on clinical trial findings and clinicians' reports, which often have low coverage of colloquial expressions available in consumer health posts. To address this problem, pharmacovigilance studies have used a few approaches mostly focused on augmentation of the standard medical lexicons by embedding Consumer Health Vocabularies (CHV). CHV was developed mainly with the purpose of covering colloquial expressions of health professional vocabularies <ref type="bibr" target="#b14">(Zeng &amp; Tse, 2006)</ref>. Here, we explain three studies which have adopted lexicon-based approaches for identifying ADRs from consumer health posts. <ref type="bibr" target="#b6">Leaman et al. (2010)</ref> constructed a lexicon of SIDER, MedEffect, and COSTART, which was augmented with CHV and a small set of colloquial ADR expressions to identify adverse drug reactions in consumer drug reviews in the "Daily Strength" forum. This study had 78.3% precision and 69.9% recall. <ref type="bibr" target="#b0">Benton et al. 
(2011)</ref> compiled a lexicon of dietary supplements, pharmaceutical terms mentioned in the Cerner Multum's Drug Lexicon, list of signs and symptoms in the Medicinenet database, FAERS, and CHV to identify ADRs of hormonal drugs used for breast cancer treatment in breast cancer healthcare forums. The reported precision was 77% and recall 35.1%. <ref type="bibr" target="#b7">Liu and Chen (2013)</ref> constructed AZD Drug Minor on UMLS, which provided 56.5% recall and 82% precision for ADR identification in a healthcare forum.</p><p>Pharmacovigilance studies that focused on identifying ADRs from consumer healthcare forums mostly attributed system errors to misspelling, colloquial expression of ADRs, use of non-standard terms, and high variability of semantic representations of a specific ADR in health posts. In addition, augmentation of the standard lexicons with CHV did not improve the system's recall significantly, indicating that the CHV is not rich in colloquial expressions of ADRs. Therefore, there is a need for an annotated corpus that not only clarifies the text segments of health posts for the presence of specific information, such as ADRs, but also fills the gap between patient and clinician terminologies by mapping colloquial expressions to standard medical terminologies.</p><p>In line with this need, <ref type="bibr" target="#b2">Ginn et al. (2014)</ref> developed an open source Twitter corpus, which was built on 10,822 instances of randomly selected tweets (each instance of tweet is a maximum of 140 characters) for drugs prescribed for chronic illness. The tweets were double coded by two annotators for presence of ADRs, spans of ADRs, drug indications, and beneficial effects. For this data set, the Inter Annotator Agreement (IAA) calculated using Cohen's Kappa was 71%. The authors normalized the identified medical terms by mapping layperson expressions to the UMLS standard terminology. <ref type="bibr" target="#b5">Karimi et al. 
(2015)</ref> developed the CADEC corpus, which was built on drug review posts in the online message board "askapatient.com". The corpus consists of 1,231 comments for two sets of drugs, Diclofenac and Lipitor. The drug reviews were annotated for spans of ADRs (6,318), which were mapped to both SNOMED-CT and MedDRA terminologies. The pair-wise agreement between annotators was 60.4%, when span and annotation settings were both strict.</p><p>The UWM-ADEC corpus is specifically developed for identifying ADRs associated with psychiatric medications. Although these medications have shown substantial evidence of effectiveness in treatment of mental illness such as depression and anxiety, they are associated with significant numbers of physiological, psychological, and cognitive ADRs unique to these types of medications. We built UWM-ADEC on drug reviews from "askapatient.com" for two classes of psychiatric medications including SSRI (Selective Serotonin Reuptake Inhibitor) and SNRI (Serotonin-norepinephrine reuptake inhibitor). In addition, we identified functional problems associated with drugs' adverse effects, such as limitations in daily functioning and social activities from the drug reviews. Drug-induced functional problems were not previously identified in CADEC and the Twitter corpus. Functional problems can result in patient non-adherence behavior, and therefore may lead to an increased risk of illness relapse, emergency room visits and hospitalizations.</p><p>UWM-ADEC can be used for text mining systems and machine learning systems, specifically for psychiatric medication pharmacovigilance and hypothesis testing related to the impact of the ADRs on attitude, discontinuation, and other medical entities.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Methodology</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Dataset Information</head><p>We examined data from an Online Message Board (OMB) "askapatient.com" that compiles uncensored user comments on the effects of taking different types of medication from people with a range of clinical diagnoses. In this OMB, patients can record their experience with a medication by filling out a form for a medication brand name. This form is composed of eight fields including rating, reason for prescription, side-effects, comments, gender, age, duration/dosage, and date of posting the review. Patients can rate their satisfaction with drugs ranging from 1 to 5, where 1 presents the least satisfaction and 5 presents the highest satisfaction. Patients are instructed to report drug ADRs in the side-effect field and the details of their experience in the comment field. However, patients were noted to report various aspects of their experiences, such as drug effectiveness or perceived distress due to ADRs, in both fields. Table <ref type="table">1</ref> shows an example of posts for Cymbalta in "askapatient.com". Table <ref type="table">1</ref>. An example of a post for Cymbalta in "askapatient.com".</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Drug Source</head><p>We used drug review posts in "askapatient.com" to collect information for four psychiatric medications: Sertraline (brand name: Zoloft) and Escitalopram (brand name: Lexapro) from Selective Serotonin Reuptake Inhibitor (SSRI) Class and venlafaxine (brand name: Effexor XR) and duloxetine (brand name: Cymbalta) from Serotonin Norepinephrine Reuptake Inhibitor (SNRI) Class. These four drugs have been primarily prescribed for depression and mood disorders. According to a dataset from Symphony Health Solutions, these medications had the highest prescription rates in 2012.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Data Collection</head><p>Because this healthcare forum does not have application program interface (API), we designed a web-crawler to collect information from the OMB. Since there is an option for filtering drug reviews for a specific drug, we could collect the data without requiring further effort. All the data in askapatient.com is anonymous and publicly available. Therefore, we did not seek any IRB approval for the data collection phase.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Dataset Statistics</head><p>We randomly selected 892 posts from a healthcare forum called "askapatient.com". Table <ref type="table" target="#tab_0">2</ref> shows demographic information of the whole sample. The gender proportion in the sample for female is significantly higher than male for both classes of drugs. Age range of the reviewer is 14-83 years old with the average of 37, and median of 35; implying that patients less than 40 are more likely to report their experience with drugs. Duration of drug usage ranged from 1 day to 20 years with an average of 18 months and median of 5 months, indicating that the duration of usage is highly skewed due to the effect of outliers. Posting reviews as soon as 1 day of treatment may indicate patients' high concern for potential ADRs.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Annotation</head><p>We created the corpus in two main phases: (1) ADR identification and (2) terminology association, also known as normalization, in which we linked the identified entities to controlled vocabularies. In the next sections, we explain the annotation guidelines and the annotation process.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Developing Guidelines for ADRs Identification</head><p>Guidelines for ADRs identification includes the ADR definitions and rules for proper identification of entities. Table <ref type="table">3</ref> includes the entity definitions and the associated rules for identification with examples. The identification rules are related to patient certainty in linking ADRs with the drug, identifying patient subjective complaints and functional problems as ADRs, as well as excluding unnecessary context such as "similes" and "metaphors" from ADRs.</p><p>Identifying patient subjective complaints are important because they may reflect subtle physiological, psychological, or cognitive ADRs associated with drugs. For example, "felt like I couldn't stop moving" reflects patient restlessness, which is a sign of akathisia. Identifying functional problems in drug review posts is also significant, not only for understanding how ADRs influence the normal daily activities of patients and their interpersonal relationships, but also for estimating the indirect cost associated with the ADRs. Collecting this information also enhances clinicians' abilities to predict the impact of ADRs on patient functionality, such as limitations of daily activities, social participation, and work performance. We further categorized identified ADRs as physiological (Phys), Psychological (Psycho), Cognitive (Cogn), and functional problem (FP).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 3. Guidelines for ADRs identification with examples</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Annotation Process</head><p>Four annotators participated in the process of identification and extraction of the three entities explained in Table <ref type="table">3</ref>.</p><p>In the second step, the documents were divided into three sets and each set was reviewed by an annotator for entity identification. In order to calculate inter-annotator agreement, the entire dataset was reviewed by the fourth annotator. We did not extract general mentions of entities, such as "side-effects", from the sentences. For example, in the sentence "I really suffered from side-effects," the phrase "side-effects" was not extracted. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Entity</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Calculating Inter-Annotator Agreement</head><p>To calculate inter-annotator agreement, we used pair-wise agreement between the annotators using the following formula <ref type="bibr">(Metke-Jimenez &amp; Karimi, 2015)</ref>:</p><formula xml:id="formula_0">\mathrm{Agreement}(A_i, A_j) = \frac{\mathrm{match}(A_i, A_j, \alpha, \beta)}{\max(n_{A_i}, n_{A_j})}</formula><p>where $A_i$ represents the set of data annotated by annotator $i$; $A_j$ represents the set of data annotated by annotator $j$; $n_{A_i}$ is the number of identified entities in $A_i$ and $n_{A_j}$ is the number of identified entities in $A_j$; $\max(n_{A_i}, n_{A_j})$ is the maximum number of identified entities; the $\alpha$ parameter represents span strictness of identified entities and the $\beta$ parameter represents tag strictness of identified entities. The computed pairwise agreement for strict match for ADR identification was 0.86.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Terminology Association</head><p>While sentence classification and entity identification in drug review posts have significant implications for automatic systems that focus on information retrieval, translating these entities to the language of health professionals fills the gap between layperson and professional expressions of medical entities, such as ADRs. This translation may benefit the generation and testing of medical hypotheses by providing unambiguous and standard information for statistical data collection and analysis.</p><p>This translation process (terminology mapping) typically involves identifying terms used by healthcare consumers and mapping them to their equivalent concepts available in medical standard vocabularies. This process is also referred to as normalization in other research <ref type="bibr" target="#b5">(Karimi et al., 2015)</ref>. To normalize the entities in our corpus, we mapped the identified entities to their corresponding concepts in Unified Medical Language System (UMLS). The UMLS Metathesaurus is a compendium of many standard medical vocabularies that provides a mapping structure among vocabularies, allowing one to translate among various terminology systems. The Metathesaurus is organized by concepts. Each concept is assigned one Concept Unique Identifier (CUI) and one or more semantic type (categories). Mapping ADRs to UMLS, in addition to normalization benefit, often reveals a list of consumer health vocabularies that has not been covered by current medical terminologies.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Guidelines for Terminology Association</head><p>Due to the different ways in which symptoms, feelings, concerns etc. are described by lay persons and medical professionals, simple matching of words will sometimes fail to capture the synonymy in meaning. For example, the consumer term "feeling sick in my stomach" is equivalent to the medical term "nausea" but no words are shared. Therefore, proper mapping of consumer terms to the concepts in the UMLS must take into account both lexical and semantic matching. Since this process sometimes involves subjective judgment, to ensure consistency in mapping, we have drawn up mapping guidelines, which were iteratively updated. These guidelines were based on insights we gained by reviewing publications including clinical trial studies targeting ADRs of the drugs specified in this study and qualitative studies investigating the themes of patient experiences with the drugs. In these publications, ADRs are often grouped into three broad areas: physiological, psychological, and cognitive, an approach which we have also adopted in our study.</p><p>In some cases, the symptom mentioned by the patient is more fine-grained than the meaning of a UMLS concept, whose meaning is more general and broader in scope. In such cases, we label the map as a "specific-to-general" map. One example is the UMLS concept "executive dysfunction". According to our research, executive dysfunction as a cognitive ADR is associated with inability to initiate and follow processes of completing a task, such as problems with initiating a task, problems with organizing a task, or problems with switching between tasks. So for a patient complaint of "cannot follow through on simple tasks", we map it to "executive dysfunction" as a more general concept.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Mapping Process</head><p>Three annotators with diverse backgrounds (pharmacist, physician, and health scientist) mapped the ADRs to proper UMLS concepts based on the guidelines of mapping that we developed for this study. Annotators used the UMLS Terminology Services, UTS browser (2017) for finding proper UMLS and SNOMED-CT concepts. An example of mapping the concepts to UMLS is shown in Table <ref type="table" target="#tab_2">4</ref>. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Corpus Statistics</head><p>Table <ref type="table" target="#tab_3">5</ref> lists frequency of identified ADRs for the corpus, as well as type of ADRs separately. Overall, we identified 4776 ADRs where 31% were duplicates, with the lowest number of duplicates for functional problems, followed by psychological problems. The findings indicate the level of subjectivity of functional and psychological ADRs that leads to creating different phrases by patients to describe their feelings and problems. Functional problems only made up 2% of the total ADRs, indicating that patients prefer to discuss physical and psychological effects of the drugs in review posts rather than their impacts on their quality of life. For the purpose of designing more effective medication adherence interventions, it would be useful if healthcare forums also asked patients to report the impact of drugs on their daily functioning and social activities. Statistics for annotation from the normalization stage were also shown in Table <ref type="table" target="#tab_4">6</ref>. The final normalization set contains 695 concepts from UMLS concepts, from which 61% belong to the physiological category and only 5% of the concepts are related to the functional category. We also report the two most frequently mapped concepts with their frequencies across the corpus for each category of ADRs. </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Normalization Challenge</head><p>While normalization of consumer health posts has significant implications for understanding pharmacological aspects of medications, it is a subjective process. We attempted to address this by developing guidelines that include underlying concepts for both patient and professional expressions of entities. But, some expressions strongly related to the life context of patients. For example, we did not map "hardly feel human anymore" to any concepts due to uncertainty of the underlying concepts associated with it. It is not clear what the patient meant with this expression: is it about the patient feeling emotionally detached, having a problem in performing daily activities, or is it about feeling detached from his mind and his body (de-realization)?</p><p>There were also some cases that, while the expression of an ADR is clear and can be translated to an equivalent medical concept, there are no UMLS concepts available for it. For example, brain zap, which is known as the professional term "brain shivers", does not have any concept in UMLS.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Limitations</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Sample size</head><p>The size of sample is limited to 892 posts for four psychiatric medications. While this sample size is a good representative of the four most common psychiatric medications, it may not be a good representative of other consumer posts in this forum or other healthcare forums. It is also possible that a specific group of patients tend to report their experiences with drugs in this forum leading to reporting bias.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Limitation for coverage of drug types</head><p>Our corpus covers sentence labeling and entities identifications for two classes of psychiatric medications, SSRI and SNRI. While limiting the dataset to a specific set of drugs enabled us to have a better understanding of conceptual models associated with layperson and professional expressions of medical entities, it may not include rare ADRs related to other classes of psychiatric medications and medications for other diseases or disorders.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Lack of information on drug-drug interactions, drug-herb interaction, and drug overdose</head><p>The focus of patients in review posts is mostly on the selected drug. Hence, it is not clear whether the reported adverse effects are caused solely by the drug or result from interaction of the drug with other drugs or herbal treatments taken by patients. Moreover, some of the ADRs for psychiatric medications, such as suicidal ideation or emergency visits, can happen because of patient overdose; this information is not available in the review posts because of the nature of these reports.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Uncertainty of data in social media</head><p>Although patient self-reported experiences is a reliable source for evaluating pharmacological effects of medications, there is still the risk of inaccurate and false information. In addition, we only identified and extracted ADRs that patients directly associated with their medications, however, there is the possibility that patients misinterpreted the symptoms of their psychiatric condition as an ADR of their psychiatric medication.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Possibility of human errors in data analysis</head><p>Although the entire data set is double coded, there is still the possibility that annotators did not interpret a sentence correctly and therefore assign a wrong label to it. In addition, the span of the identified entities may include less or more information than necessary. These problems affect the performance of any machine learning system trained on this corpus to identify drug effectiveness, ADRs, and drug indications in consumer health posts.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Conclusions and Future Work</head><p>We have created a corpus of ADRs with the purpose of improving recall and precision of automatic systems designed for identifying ADRs from social media. The source of this corpus is patient reviews of psychiatric drugs in a medical forum called askapatient.com. Sentences in review posts were annotated for the presence of ADRs and span of ADRs. This corpus can benefit researchers in several areas including 1) developing and evaluating systems that automatically identify ADRs from consumer health posts, 2) developing systems that automatically map free text to UMLS, 3) creating a structured vocabulary of layperson expressions of adverse effects and indications which can be used in electronic health records (EHR) for facilitating seamless information exchange between patients and clinicians. This can be achieved by mapping information in personal health records (PHR) to EHR systems.</p><p>We are in the process of annotating withdrawal symptoms, drug indications, and effectiveness from the consumer health reports. We are also mapping the entities to corresponding terms in SNOMED-CT terminology.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0"><head></head><label></label><figDesc></figDesc><graphic coords="3,318.04,491.64,236.64,253.68" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>Table 2 .</head><label>2</label><figDesc>Corpus statistics</figDesc><table><row><cell cols="2">Rating Reason</cell><cell>Side-effect</cell><cell>Comment</cell><cell cols="2">Gender Age Duration</cell><cell>Date</cell></row><row><cell>3</cell><cell>fibromyalg</cell><cell>Nausea, diarrhea,</cell><cell>I have only been on 30mg for 4 days</cell><cell>F</cell><cell>38 4 days</cell><cell>2009-10-05</cell></row><row><cell></cell><cell>ia/depressi</cell><cell>upset stomach, dry</cell><cell>and have the extreme runs. Upset</cell><cell></cell><cell></cell><cell></cell></row><row><cell></cell><cell>on</cell><cell>mouth, sleepiness</cell><cell>stomach and no appetite. Pain in</cell><cell></cell><cell></cell><cell></cell></row><row><cell></cell><cell></cell><cell></cell><cell>minimal though and I feel less</cell><cell></cell><cell></cell><cell></cell></row><row><cell></cell><cell></cell><cell></cell><cell>anxious and depressed.</cell><cell></cell><cell></cell><cell></cell></row><row><cell></cell><cell></cell><cell></cell><cell cols="2">Dataset statistics</cell><cell cols="2">Dataset</cell></row><row><cell></cell><cell></cell><cell></cell><cell>Sample Size</cell><cell></cell><cell></cell><cell>892</cell></row><row><cell></cell><cell></cell><cell></cell><cell cols="2">No. of reviews with text</cell><cell></cell><cell>887</cell></row><row><cell></cell><cell></cell><cell></cell><cell>Time span</cell><cell></cell><cell cols="2">Feb 2001</cell></row><row><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell cols="2">Sep 2016</cell></row><row><cell></cell><cell></cell><cell></cell><cell>Rating</cell><cell></cell><cell></cell><cell>3.16</cell></row><row><cell></cell><cell></cell><cell></cell><cell>Gender</cell><cell></cell><cell cols="2">F 669 (76%)</cell></row><row><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell cols="2">M 212 (24%)</cell></row><row><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell cols="2">Missing value (11)</cell></row><row><cell></cell><cell></cell><cell></cell><cell>Age</cell><cell></cell><cell cols="2">Avg. 37</cell></row><row><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell cols="2">Med. 35</cell></row><row><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell cols="2">Missing values (12)</cell></row><row><cell></cell><cell></cell><cell></cell><cell>Age range</cell><cell></cell><cell></cell><cell>14-83</cell></row><row><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell cols="2">Missing values (3)</cell></row><row><cell></cell><cell></cell><cell></cell><cell cols="2">Duration of usage</cell><cell cols="2">Avg. 18 months</cell></row><row><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell><cell cols="2">Med. 5 months</cell></row><row><cell></cell><cell></cell><cell></cell><cell cols="3">Duration of usage (range)</cell><cell></cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head>Table 4 .</head><label>4</label><figDesc>Examples of mapping ADRs to UMLS Concepts</figDesc><table><row><cell>Drug_ID</cell><cell>Sen_ID</cell><cell>Original Term</cell><cell>UMLS (1)</cell></row><row><cell cols="2">cymbalta.124 1</cell><cell>Felt sick</cell><cell>C0857027 / Feeling Sick /Sign or Symptom</cell></row><row><cell>lexapro.12</cell><cell>3</cell><cell>"Zombie" like</cell><cell>C0857486/ Felt like a zombie/ Finding</cell></row><row><cell>cymbalta.12</cell><cell>2</cell><cell>Constipation</cell><cell>C0009806/ Constipation/ Sign or Symptom</cell></row><row><cell cols="2">cymbalta.131 1</cell><cell>Excessive sleepiness</cell><cell>C0013144/ Drowsiness/ Finding</cell></row><row><cell>Effexor.78</cell><cell>1</cell><cell>Minor muscle spasms</cell><cell>C0037763 / Spasm/ Sign or Symptom</cell></row><row><cell>effexor.97</cell><cell>2</cell><cell>Sweating like crazy all the</cell><cell>C0700590 / Increased sweating / Sign or</cell></row><row><cell></cell><cell></cell><cell>time</cell><cell>Symptom</cell></row><row><cell>effexor.111</cell><cell>7</cell><cell>Brain zap</cell><cell>No concept</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_3"><head>Table 5 .</head><label>5</label><figDesc>Frequency of identified ADRs for the total corpus</figDesc><table><row><cell></cell><cell></cell><cell>Total</cell><cell cols="2">Physiological</cell><cell cols="2">Psychological</cell><cell></cell><cell>Cognitive</cell><cell></cell><cell>Functional</cell></row><row><cell></cell><cell cols="3">ADRs Unique All</cell><cell cols="2">Unique All</cell><cell>Unique</cell><cell>All</cell><cell>Unique</cell><cell cols="2">All Unique</cell></row><row><cell>ADRs</cell><cell>4776</cell><cell>69%</cell><cell>3522</cell><cell>64%</cell><cell>900</cell><cell>81%</cell><cell cols="2">272 80% (All)</cell><cell>82</cell><cell>95% (All)</cell></row><row><cell>in</cell><cell></cell><cell>(All)</cell><cell></cell><cell>(All)</cell><cell></cell><cell>(All)</cell><cell></cell><cell>(217)</cell><cell></cell><cell>(78)</cell></row><row><cell>Corpus</cell><cell></cell><cell>3285</cell><cell></cell><cell>2274</cell><cell></cell><cell>(716)</cell><cell></cell><cell></cell><cell></cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_4"><head>Table 6 .</head><label>6</label><figDesc>Statistics for annotation from normalization stage</figDesc><table><row><cell></cell><cell>Total</cell><cell>Physiological</cell><cell>Psychological</cell><cell>Cognitive</cell><cell>Functional</cell></row><row><cell>No. Unique</cell><cell>695</cell><cell>425 (61% total)</cell><cell cols="3">196 (28% total) 42 (6% total) 31 (5%)</cell></row><row><cell>concepts</cell><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell></row><row><cell>No. Unique</cell><cell>695</cell><cell>425 (61% total)</cell><cell cols="3">196 (28% total) 42 (6% total) 31 (5%)</cell></row><row><cell>concepts</cell><cell></cell><cell></cell><cell></cell><cell></cell><cell></cell></row><row><cell>1st most freq.</cell><cell>Sleeplessness</cell><cell>Sleeplessness</cell><cell>Anxiety (94)</cell><cell>Foggy feeling</cell><cell>Difficulty in daily</cell></row><row><cell>concept</cell><cell>(171)</cell><cell>(171)</cell><cell></cell><cell>in head (47)</cell><cell>functioning (10)</cell></row><row><cell>2nd most freq.</cell><cell cols="2">Nausea (169) Nausea (169)</cell><cell>Detailed recall</cell><cell>Unable to</cell><cell>Emergency room</cell></row><row><cell>concept</cell><cell></cell><cell></cell><cell>of dream (62)</cell><cell>concentrate</cell><cell>admission (9)</cell></row><row><cell></cell><cell></cell><cell></cell><cell></cell><cell>(30)</cell><cell></cell></row></table></figure>
		</body>
		<back>
			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Identifying potential adverse effects using the web: A new approach to medical hypothesis generation</title>
		<author>
			<persName><forename type="first">A</forename><surname>Benton</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Ungar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Hill</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Hennessy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Mao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Chung</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">H</forename><surname>Holmes</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of Biomedical Informatics</title>
		<imprint>
			<biblScope unit="volume">44</biblScope>
			<biblScope unit="issue">6</biblScope>
			<biblScope unit="page" from="989" to="996" />
			<date type="published" when="2011">2011</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">A simple algorithm for identifying negated findings and diseases in discharge summaries</title>
		<author>
			<persName><forename type="first">W</forename><forename type="middle">W</forename><surname>Chapman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Bridewell</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Hanbury</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><forename type="middle">F</forename><surname>Cooper</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><forename type="middle">G</forename><surname>Buchanan</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of Biomedical Informatics</title>
		<imprint>
			<biblScope unit="volume">34</biblScope>
			<biblScope unit="issue">5</biblScope>
			<biblScope unit="page" from="301" to="310" />
			<date type="published" when="2001">2001</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Mining Twitter for adverse drug reaction mentions: a corpus and classification benchmark</title>
		<author>
			<persName><forename type="first">R</forename><surname>Ginn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Pimpalkhute</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Nikfarjam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Patki</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>O'Connor</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Sarker</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Gonzalez</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the fourth workshop on building and evaluating resources for health and biomedical text processing</title>
				<meeting>the fourth workshop on building and evaluating resources for health and biomedical text processing</meeting>
		<imprint>
			<date type="published" when="2014">2014</date>
		</imprint>
	</monogr>
	<note>Paper presented at the</note>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Extraction of potential adverse drug events from medical case reports</title>
		<author>
			<persName><forename type="first">H</forename><surname>Gurulingappa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mateen-Rajput</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Toldo</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">J Biomed Semantics</title>
		<imprint>
			<biblScope unit="volume">3</biblScope>
			<biblScope unit="issue">1</biblScope>
			<biblScope unit="page">15</biblScope>
			<date type="published" when="2012">2012</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<monogr>
		<title level="m" type="main">Adverse drug reaction classification with deep neural networks</title>
		<author>
			<persName><forename type="first">T</forename><surname>Huynh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>He</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Willis</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Rüger</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">CADEC: A corpus of adverse drug event annotations</title>
		<author>
			<persName><forename type="first">S</forename><surname>Karimi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Metke-Jimenez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Kemp</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Wang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of Biomedical Informatics</title>
		<imprint>
			<biblScope unit="volume">55</biblScope>
			<biblScope unit="page" from="73" to="81" />
			<date type="published" when="2015">2015</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">Towards internet-age pharmacovigilance: extracting adverse drug reactions from user posts to health-related social networks</title>
		<author>
			<persName><forename type="first">R</forename><surname>Leaman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Wojtulewicz</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Sullivan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Skariah</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Gonzalez</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2010 workshop on biomedical natural language processing</title>
				<meeting>the 2010 workshop on biomedical natural language processing</meeting>
		<imprint>
			<date type="published" when="2010">2010</date>
		</imprint>
	</monogr>
	<note>Paper presented at the</note>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">AZDrugMiner: an information extraction system for mining patient-reported adverse drug events in online patient forums</title>
		<author>
			<persName><forename type="first">X</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Chen</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Smart Health</title>
		<imprint>
			<date type="published" when="2013">2013</date>
			<publisher>Springer</publisher>
			<biblScope unit="page" from="134" to="150" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<monogr>
		<title level="m" type="main">Concept Extraction to Identify Adverse Drug Reactions in Medical Forums: A Comparison of Algorithms</title>
		<author>
			<persName><forename type="first">A</forename><surname>Metke-Jimenez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Karimi</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1504.06936</idno>
		<imprint>
			<date type="published" when="2015">2015</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">Evaluation of text-processing algorithms for adverse drug event extraction from social media</title>
		<author>
			<persName><forename type="first">A</forename><surname>Metke-Jimenez</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Karimi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Paris</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the first international workshop on Social media retrieval and analysis</title>
				<meeting>the first international workshop on Social media retrieval and analysis</meeting>
		<imprint>
			<date type="published" when="2014">2014</date>
		</imprint>
	</monogr>
	<note>Paper presented at the</note>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">Estimation of the prevalence of adverse drug reactions from social media</title>
		<author>
			<persName><forename type="first">T</forename><surname>Nguyen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">E</forename><surname>Larsen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>O'Dea</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Phung</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Venkatesh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Christensen</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Int J Med Inform</title>
		<imprint>
			<biblScope unit="volume">102</biblScope>
			<biblScope unit="page" from="130" to="137" />
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Pharmacovigilance from social media: mining adverse drug reaction mentions using sequence labeling with word embedding cluster features</title>
		<author>
			<persName><forename type="first">A</forename><surname>Nikfarjam</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Sarker</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>O'Connor</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Ginn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Gonzalez</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of the American Medical Informatics Association</title>
		<imprint>
			<biblScope unit="page">ocu041</biblScope>
			<date type="published" when="2015">2015</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">Portable automatic text classification for adverse drug reaction detection via multi-corpus training</title>
		<author>
			<persName><forename type="first">A</forename><surname>Sarker</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Gonzalez</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of Biomedical Informatics</title>
		<imprint>
			<biblScope unit="volume">53</biblScope>
			<biblScope unit="page" from="196" to="207" />
			<date type="published" when="2015">2015</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">Social media mining for drug safety signal detection</title>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">C</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Jiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Zhang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2012 international workshop on Smart health and wellbeing</title>
				<meeting>the 2012 international workshop on Smart health and wellbeing</meeting>
		<imprint>
			<date type="published" when="2012">2012</date>
		</imprint>
	</monogr>
	<note>Paper presented at the</note>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">Exploring and developing consumer health vocabularies</title>
		<author>
			<persName><forename type="first">Q</forename><forename type="middle">T</forename><surname>Zeng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Tse</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of the American Medical Informatics Association</title>
		<imprint>
			<biblScope unit="volume">13</biblScope>
			<biblScope unit="issue">1</biblScope>
			<biblScope unit="page" from="24" to="29" />
			<date type="published" when="2006">2006</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
