<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Demonstrations / success stories On the Performance of Large Scale Bayesian Phylogenetic Analyses with Grid Portals and Robot Certificates</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Roberto</forename><surname>Barbera</surname></persName>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Italian National Institute of Nuclear Physics</orgName>
								<orgName type="department" key="dep2">Division of Catania</orgName>
								<address>
									<addrLine>Via S. Sofia 64</addrLine>
									<postCode>I-95123</postCode>
									<settlement>Catania</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="department">Department of Physics and Astronomy</orgName>
								<orgName type="institution">University of Catania</orgName>
								<address>
									<addrLine>Viale A. Doria 6</addrLine>
									<postCode>I-95125</postCode>
									<settlement>Catania</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Giacinto</forename><surname>Donvito</surname></persName>
							<affiliation key="aff2">
								<orgName type="department" key="dep1">Italian National Institute of Nuclear Physics</orgName>
								<orgName type="department" key="dep2">Division of Bari</orgName>
								<address>
									<addrLine>Via E. Orabona 4</addrLine>
									<postCode>I-70126</postCode>
									<settlement>Bari</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Alberto</forename><surname>Falzone</surname></persName>
							<affiliation key="aff3">
								<orgName type="institution">NICE Srl</orgName>
								<address>
									<addrLine>Via Milliavacca 9</addrLine>
									<postCode>I-14100</postCode>
									<settlement>Asti</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Giuseppe</forename><surname>La Rocca</surname></persName>
							<affiliation key="aff0">
								<orgName type="department" key="dep1">Italian National Institute of Nuclear Physics</orgName>
								<orgName type="department" key="dep2">Division of Catania</orgName>
								<address>
									<addrLine>Via S. Sofia 64</addrLine>
									<postCode>I-95123</postCode>
									<settlement>Catania</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Giorgio</forename><forename type="middle">Pietro</forename><surname>Maggi</surname></persName>
							<affiliation key="aff2">
								<orgName type="department" key="dep1">Italian National Institute of Nuclear Physics</orgName>
								<orgName type="department" key="dep2">Division of Bari</orgName>
								<address>
									<addrLine>Via E. Orabona 4</addrLine>
									<postCode>I-70126</postCode>
									<settlement>Bari</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Saverio</forename><surname>Vicario</surname></persName>
							<affiliation key="aff5">
								<orgName type="institution">CNR-ITB Bari</orgName>
								<address>
									<addrLine>Via Amendola 122D</addrLine>
									<postCode>I-70126</postCode>
									<settlement>Bari</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Luciano</forename><surname>Milanesi</surname></persName>
							<affiliation key="aff4">
								<orgName type="department">Institute for Biomedical Technologies - CNR</orgName>
								<address>
									<addrLine>Via Fratelli Cervi 93</addrLine>
									<postCode>I-20090</postCode>
									<settlement>Segrate</settlement>
									<region>MI</region>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Sandra</forename><surname>Gesing</surname></persName>
						</author>
						<author>
							<persName><forename type="first">Jano</forename><surname>van Hemert</surname></persName>
						</author>
						<title level="a" type="main">Demonstrations / success stories On the Performance of Large Scale Bayesian Phylogenetic Analyses with Grid Portals and Robot Certificates</title>
					</analytic>
					<monogr>
						<imprint>
							<date/>
						</imprint>
					</monogr>
					<idno type="MD5">6AD02AE1C9E6070E28142345E634DA58</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2023-03-25T05:21+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>ICT-based Infrastructures today play a crucial role in the development of collaborations among scientists to address new global scientific challenges, particularly those that have high societal and economic impact. In the Life Science domain, for instance, the massive computational resources exposed by Grid infrastructures are indispensable when dealing both with the complexity of models and the enormous quantity of data to be processed, for example, to perform genome scale analysis or when carrying out docking simulations for the study of new drugs. At present, Grid technology is presented to end-users as a collection of virtual services and complex protocols and this makes its full exploitation very complicated. A notable step forward to foster the adoption of this technology in e-Science has recently been achieved with the adoption of portals and robot certificates. Robot certificates have been conceived and introduced to allow non-expert users to access Grid Infrastructures and reduce the initial barriers. Each robot certificate is associated with a function which identifies the specific application the user wants to share with all the members of the same community. In this manuscript the solution proposed by the Italian National Institute of Nuclear Physics to allow bioinformaticians to access the Grid via a portal enabled by a robot certificate and perform large scale Bayesian Phylogenetic analyses is presented. The solution described in this manuscript strongly simplifies the exploitation and the utilization of Grid Infrastructures and represents a valuable step forward towards the adoption of this computing paradigm in Life Sciences.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">INTRODUCTION</head><p>* To whom correspondence should be addressed.</p><p>The study of molecular evolution is based on the principal hypothesis that the level of similarity between genes displays the degree of evolutionary relationship between them. The reconstruction of the evolutionary history of a group of organisms (a phylogeny) is used throughout Life Sciences, as it offers a means to organize the knowledge and data accumulated by researchers. It is commonly known that the unraveling of phylogenetic relationships between organisms' gene sequences is an important first step towards the understanding of their evolution. Molecular evolution is generally studied only by inference: a pattern (in this case a set of biological sequences) is observed and different possible processes are evaluated to infer what process produced this pattern. The MrBayes program (http://mrbayes.csit.fsu.edu/) <ref type="bibr" target="#b6">(Ronquist et al. 2003)</ref> uses a Markovian integration to obtain samples from the posterior distribution of the parameters to calculate the inference. The program generates a Bayesian phylogenetic inference among different aligned bio-sequences. The inference makes it possible to identify the distribution of the most likely genetic relationship among the set of chosen bio-sequences and, at the same time, the best set of values for the parameters of the postulated model of evolution of the bio-sequences. MrBayes has a great richness of models of evolution for DNA (both as nucleotide and codon), RNA (model for evolution of doublet of nucleotide to model the secondary structure of an RNA molecule), protein, and even arbitrary hereditary discrete characters. Another peculiarity of the application is that it allows the usage of "mixed" models such as using different models for different parts of each biosequence with the possibility to share parameters among the different models. 
The program uses a Metropolis-Coupled Monte Carlo Markov Chain (MCMCMC) to perform the Markovian integration necessary to solve numerically the Bayesian equation <ref type="bibr" target="#b0">(Altekar et al. 2004)</ref>. Due to the nature of Bayesian inference, in order to achieve a better estimation, the MrBayes program has to run for millions of iterations (generations) which require a large amount of computation time. The input required is a single text file, nexus formatted <ref type="bibr" target="#b5">(Maddison et al. 1997)</ref>, subdivided into a data block and a MrBayes block in which the models and the parameters of Markovian integration are defined and declared. The output consists of three kinds of large files (typically in the order of several hundreds of Megabytes each) that describe, respectively, the posterior distribution of numerical and topological parameters and several diagnostic measures related to the mixing of Markov chains and the converging of the algorithm as a whole. The use of a distributed version of MrBayes is more problematic given the nature of the Markovian integration. Typically, each MrBayes run, although starting from fairly small input data, has a quite long execution time. This is a typical analysis which can be tackled by a "high-throughput" approach made feasible with the use of Grid infrastructures. This is why in this work we decided to restrict the executions, each made of a single complete analysis, to high performance nodes of the EGEE grid (http://www.eu-egee.org/) configured to accept MPI jobs. This ensured that jobs would run in a sufficiently efficient manner and they would arrive at completion before the maximum run time allowed by the chosen nodes of the EGEE grid. 
In addition, to allow all the bioinformaticians of the LIBI project (http://www.libi.it/) to access the computing resources of the EGEE Grid Infrastructure without owning a personal X.509 certificate and run the parallel version of MrBayes, the credentials of a robot certificate have been made available with the GENIUS Grid portal according to the architectural schema reported in the next section. Thanks to the introduction of Grid portals and robot certificates, it is now possible to reduce the initial barriers and extend the benefits of the Grid paradigm to a wider community of users. In section 2 the details of the distributed grid environment designed and deployed in the context of the LIBI project to perform phylogenetic analyses on a large scale are presented. Results are summarized in section 3.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2">METHODS</head><p>The EnginFrame (http://www.enginframe.com/) Java framework (ver. 4.1), on which the GENIUS Grid portal <ref type="bibr" target="#b1">(Andronico et al. 2003</ref><ref type="bibr" target="#b2">, Barbera et al. 2007</ref>) is built, has been enhanced in order to provide a transparent support to robot certificates and allow non-expert Grid users to access the distributed computational resources of a Grid using a conventional web browser. The additional features introduced in the GENIUS Grid portal are sketched in fig. <ref type="figure">1</ref>.</p><p>Fig. <ref type="figure">1</ref> - The workflow set up to perform large scale phylogenetic analyses.</p><p>The distributed environment for the application has been built on top of the EGEE stack (http://public.eu-egee.org/) using many gLite (http://www.glite.org/) services such as: the LFC File Catalog and Data Management System, Storage Elements, Workload Management System and X.509 certificates. The client side is represented by a user's workstation running a web browser (bottom left in the figure). The server side is represented by a gLite User Interface (UI) machine, equipped with the latest stable release of middleware services to submit jobs and manage data on Grid, the Apache Web Server, the Java/XML portal framework EnginFrame (http://www.enginframe.com/docum/) developed by NICE Srl (http://www.nice-italy.com/) and the GENIUS Grid Portal itself (bottom center in the figure). After the user's login, a proxy certificate is requested by the portal to access the distributed resources of a Grid Infrastructure according to the Grid Security Infrastructure (GSI) standard. If no proxy is available, the credentials of the robot certificate, if any, will be read by the GENIUS Grid portal to generate in a few seconds the needed proxy. This operation is completely transparent to end-users. 
Once the proxy certificate has successfully been created, users are automatically redirected to the home page of the application related to the robot certificate. In this context the robot certificate has been requested to run the phylogenetic application, thus after login the user is redirected to the MrBayes' home page. For this purpose some dedicated services have been designed and implemented into the GENIUS portal to allow users to specify input settings before sending parallel instances of MrBayes jobs to the Grid. Besides, in order to enhance the reliability and the performance of the architecture, the support for the submission/re-submission of a large number of jobs in an almost unattended way has also been introduced (bottom right in the figure). This tool is based on the concept of "task" to be executed <ref type="bibr" target="#b4">(De Sario et al. 2009)</ref>. The entire problem is first subdivided into elementary tasks, then all the tasks are inserted into a DB server. In the submission phase all the jobs are completely identical. Only when a job lands and starts executing on a worker node does it request a task to execute from the central DB. Information on the execution of each task is logged into the central DB. Only if all steps are correctly executed by the job, the status of that particular task on the central DB is updated to "Done". In this way the central DB provides a monitoring of the task execution and no manual intervention is required to manage the re-submission of the failed tasks. Tasks which are found in a "running" state after a given time interval are considered failed and automatically reassigned to new jobs. Figure <ref type="figure" target="#fig_0">2</ref> shows the service introduced into the portal to query the central database of the Job Submission Tool and monitor users' tasks. This service, based on HTML, XML, JavaScript and PHP, refreshes data every 5 minutes.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3">RESULTS</head><p>The adoption of a personal certificate to access Grid resources has represented so far a limiting factor for the real spreading of the Grid paradigm in many types of research. Many scientists are interested in using Grid as a tool to solve their computing problems and speed up the production of scientific results but the basis of the Grid Security Infrastructure risks discouraging many of them. The benefits introduced by the GENIUS portal and robot certificates in Life Sciences are far reaching because they can contribute both to effectively reduce the scientific gap requested to access Grid infrastructures and make the adoption of this technology transparent not only to biologists but also to many other scientific communities. The solution presented in this paper has been successfully evaluated and adopted by different EU co-funded projects. In the context of the e-NMR project (http://www.enmr.eu/) the HADDOCK web portal (http://www.haddocking.eu/) makes use of robot certificates issued by the Dutch CA in accordance with the VO Portal Policy draft documented by the Joint Security Policy Group (JSPG) of EGEE. The portal gives access to information-driven docking at various levels of expertise, from an easy to a guru interface providing full control on the docking parameters. The GridSPM is another web portal <ref type="bibr" target="#b3">(Corradi et al.)</ref> that allows the statistical analysis of SPECT and PET cerebral images through the Statistical Parametric Mapping (SPM) system (http://www.neuroinf.it/medico/Analisi/). 
Finally, in the context of the EU co-funded GRIDCC (http://www.gridcc.org/) and DORII (http://www.dorii.eu/) projects, ELETTRA (http://www.elettra.trieste.it/) has developed the Virtual Control Room (VCR) (http://www.dorii.eu/middleware), a grid portal which allows users to interactively control remote Instrument Elements and supports both user and robot certificates.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Fig. 2 -</head><label>2</label><figDesc>Fig. 2 - View of the task monitoring system. The completed tasks obtained at the end of the analysis are stored in a Grid Storage Element (SE) and then downloaded locally in the user's home directory by means of a pop-up service. The results of the computation can then be used to display the phylogenetic tree with third party software like, for example, TreeViewX (http://darwin.zoology.gla.ac.uk/~rpage/treeviewx/).</figDesc><graphic coords="2,315.24,429.84,223.32,205.80" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Fig. 3 -</head><label>3</label><figDesc>Fig. 3 - Display of the phylogenetic tree with TreeViewX.</figDesc><graphic coords="3,62.16,117.36,228.72,171.12" type="bitmap" /></figure>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>ACKNOWLEDGEMENTS</head><p>We gratefully acknowledge all the people who supported this work contributing with ideas, comments and feedback and the e-Science Institute in Edinburgh.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Funding</head><p>This work was supported by the MUR FIRB LIBI "Italian Laboratory for Bioinformatics", LITBIO (http://www.litbio.org/, RBLA0332RH), and ITALBIONET (RBPR05ZK2Z_001) Italian projects and by the EGEE-III (contract number: 222667) and BIOINFOGRID (http://www.bioinfogrid.eu/, contract number: 026808) European projects.</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Parallel Metropolis coupled Markov chain Monte Carlo for Bayesian phylogenetic inference</title>
		<author>
			<persName><forename type="first">G</forename><surname>Altekar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Dwarkadas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">P</forename><surname>Huelsenbeck</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Ronquist</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Bioinformatics</title>
		<imprint>
			<biblScope unit="volume">20</biblScope>
			<biblScope unit="issue">3</biblScope>
			<biblScope unit="page" from="407" to="415" />
			<date type="published" when="2004">2004</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">GENIUS: a web portal for the grid</title>
		<author>
			<persName><forename type="first">G</forename><surname>Andronico</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Barbera</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Falzone</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Lo Re</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Pulvirenti</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Rodolico</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Nucl. Instrument and Methods in Phy. Res. A</title>
		<imprint>
			<date type="published" when="2003">2003</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">The GENIUS Grid Portal: Its Architecture, Improvements of Features and New Implementations about Authentication and Authorization</title>
		<author>
			<persName><forename type="first">R</forename><surname>Barbera</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">16th IEEE International Workshops on Enabling Technologies: Infrastructure for Collaborative Enterprises (WETICE 2007)</title>
				<imprint>
			<date type="published" when="2007">2007</date>
			<biblScope unit="page" from="279" to="283" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">XTENS -an eXTensible Environment for NeuroScience</title>
		<author>
			<persName><forename type="first">L</forename><surname>Corradi</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">HealthGrid 2009</title>
				<meeting><address><addrLine>Berlin</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2009">2009</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">The Job Submission Tool, &quot;High-throughput GRID computing for Life Sciences&quot;</title>
		<author>
			<persName><forename type="first">G</forename><surname>De Sario</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Handbook of Research on Computational Grid Technologies for Life Sciences, Biomedicine and Healthcare, IGI Global</title>
		<editor>Mario Cannataro</editor>
		<imprint>
			<biblScope unit="volume">1</biblScope>
			<biblScope unit="page" from="198" to="203" />
			<date type="published" when="2009">2009</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">NEXUS: an extensible file format for systematic information</title>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">R</forename><surname>Maddison</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Syst. Biol</title>
		<imprint>
			<biblScope unit="page" from="590" to="621" />
			<date type="published" when="1997">1997</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">MrBayes 3: Bayesian phylogenetic inference under mixed models</title>
		<author>
			<persName><forename type="first">F</forename><surname>Ronquist</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">P</forename><surname>Huelsenbeck</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Bioinformatics</title>
		<imprint>
			<biblScope unit="volume">19</biblScope>
			<biblScope unit="issue">12</biblScope>
			<biblScope unit="page" from="1572" to="1574" />
			<date type="published" when="2003">2003</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
