<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta />
    <article-meta>
      <title-group>
        <article-title>Scientific Data Analysis Using Data-Intensive Scalable Computing: the SciDISC Project</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author">
          <string-name>Patrick Valduriez</string-name>
          <email>Patrick.Valduriez@inria.fr</email>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Marta Mattoso</string-name>
          <email>marta@cos.ufrj.br</email>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Reza Akbarinia</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Heraldo Borges</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>José Camata</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Alvaro Coutinho</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Daniel Gaspar</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Noel Lemus</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Ji Liu</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Hermano Lustosa</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Florent Masseglia</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Fabricio Nogueira da Silva</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Vítor Silva</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Renan Souza</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Kary Ocaña</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Eduardo Ogasawara</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Daniel de Oliveira</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Esther Pacitti</string-name>
          <xref ref-type="aff" rid="aff0">0</xref>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Fabio Porto</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Dennis Shasha</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>COPPE/UFRJ</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Rio de Janeiro - RJ - Brazil</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>CEFET/RJ</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Rio de Janeiro - RJ - Brazil</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Rio de Janeiro - RJ - Brazil</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Rio de Janeiro - RJ - Brazil</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>New York - NY - USA</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Science</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <string-name>Workshop</string-name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <aff id="aff0">
          <label>0</label>
          <institution>Inria, LIRMM and University Montpellier -</institution>
          <country country="FR">France</country>
        </aff>
        <aff id="aff1">
          <label>1</label>
          <institution>Invited paper</institution>
          ,
          <addr-line>Latin American VLDB 2018, Rio de Janeiro</addr-line>
          ,
          <country country="BR">Brazil</country>
        </aff>
      </contrib-group>
      <pub-date>
        <year>2018</year>
      </pub-date>
      <abstract>
        <p>Data-intensive science requires the integration of two fairly different paradigms: high-performance computing (HPC) and data-intensive scalable computing (DISC), as exemplified by frameworks such as Hadoop and Spark. In this context, the SciDISC project addresses the grand challenge of scientific data analysis using DISC, by developing architectures and methods to combine simulation and data analysis. SciDISC is an ongoing project between Inria, several research institutions in Rio de Janeiro and NYU. This paper introduces the motivations and objectives of the project, and reports on the first results achieved so far.</p>
      </abstract>
    </article-meta>
  </front>
  <body>
    <sec id="sec-1">
      <title>1. Introduction</title>
      <p>
        Modern science such as astronomy, biology, computational engineering and
environmental science must deal with overwhelming amounts of data (e.g. coming from
sensors and scientific instruments, or produced by simulation). Increasingly, scientific
breakthroughs will be powered by advanced computing capabilities that help researchers
manipulate and explore these massive datasets
        [<xref ref-type="bibr" rid="ref14">Hey 2009</xref>].
*
      </p>
      <p>(LADaS),</p>
      <p>
        Such data-intensive science
        [<xref ref-type="bibr" rid="ref8">Critchlow 2013</xref>] requires the integration of two
fairly different paradigms: high-performance computing (HPC) and data-intensive
scalable computing (DISC). HPC is compute-centric and focuses on high performance
of simulation applications, typically using powerful, yet expensive, supercomputers.
DISC [<xref ref-type="bibr" rid="ref3">Bryant 2011</xref>], on the other hand, is data-centric and focuses on fault-tolerance
and scalability of web and cloud applications using cost-effective clusters of commodity
hardware. Examples of DISC systems include big data processing frameworks such as
Hadoop or Apache Spark or NoSQL systems (see
        [<xref ref-type="bibr" rid="ref2">Bondiombouy 2016</xref>], which includes
a survey of DISC systems). To harness parallel processing, HPC uses a low-level
programming model (such as MPI or OpenMP) while DISC relies on powerful data
processing operators (Map, Reduce, Filter, etc.). Data storage is also quite different:
supercomputers typically rely on a shared disk infrastructure and data must be loaded in
compute nodes before processing while DISC systems rely on a shared-nothing cluster
(of disk-based nodes) and data partitioning.
      </p>
      <p>
        Spurred by the growing need to analyze big scientific data, the convergence
between HPC and DISC has been a recent topic of interest
        [<xref ref-type="bibr" rid="ref7">Coutinho 2014</xref>, Valduriez
2015]. However, simply porting the Hadoop stack on a supercomputer
        [<xref ref-type="bibr" rid="ref5">Fox 2016</xref>] is not
cost-effective, and does not solve the scalability and fault-tolerance issues addressed by
DISC. On the other hand, DISC systems have not been designed for scientific
applications, which have different requirements in terms of data analysis and
visualization.
      </p>
      <p>
        This international project between Inria (France), several research institutions in
Rio de Janeiro (Brazil) and NYU (USA), addresses the grand challenge of scientific data
analysis using DISC (SciDISC), by developing architectures and methods to combine
simulation and data analysis. We can distinguish between three main approaches
depending on where analysis is done
        [<xref ref-type="bibr" rid="ref23 ref7">Oldfield 2014</xref>]: post-processing, in-situ and
in-transit. Post-processing analysis performs analysis after simulation, e.g. by loosely
coupling a supercomputer and a SciDISC cluster (possibly in the cloud). This approach
is the simplest but is restricted to batch analysis. In-situ analysis runs on the same
compute resources as the simulation, e.g. a supercomputer, thus making it easy to
perform interactive analysis. In-transit analysis offloads analysis to a separate partition
of compute resources, e.g. using a single cluster with both compute nodes and data
nodes that communicate through a high-speed network. Although less intrusive than
in-situ, this approach requires careful synchronization of simulation and analysis.
      </p>
      <p>In the SciDISC project, we study different architectures for SciDISC and their
trade-offs. We address the following main steps of the data-intensive science process:
(1) data preparation, including raw data ingestion (e.g. from sensors) and data cleaning,
transformation and integration; (2) data processing and simulation execution; (3)
exploratory data analysis and visualization; (4) data mining, knowledge discovery and
recommendation. Note that these steps are not necessarily sequential, for instance, steps
2 and 3 need to be interleaved to perform real time analysis.</p>
      <p>The expected results of SciDISC are: new data analysis methods for SciDISC
systems; the integration of these methods as software libraries in popular DISC systems,
such as Apache Spark; and extensive validation on real scientific applications, by
working with our scientific partners such as INRA and IRD in France and Petrobras, the
National Research Institute (INCT) on e-medicine (MACC) and the e-astronomy
laboratory LIneA in Brazil.</p>
      <p>In the rest of this paper, we report on our first results. Section 2 discusses a
generic SciDISC architecture that serves as a basis for developing new distributed and
parallel techniques to deal with scientific data analysis. Section 3 deals with interactive
analysis of simulation data and visualization. Section 4 addresses data mining of
scientific data. Section 5 deals with the use of machine learning for recommendation in
SciDISC. Section 6 concludes and gives our future research directions.</p>
    </sec>
    <sec id="sec-2">
      <title>2. SciDISC Architecture</title>
      <p>The first part of the project has been devoted to the definition of a SciDISC architecture
that serves as a basis for developing new distributed and parallel techniques to deal with
scientific data. We consider a generic architecture that features a high-performance
computer (e.g. to perform data processing and simulation) with shared-disk and a
shared-nothing cluster to perform data analysis. The high-performance computer can be
a supercomputer (e.g. LNCC Santos Dumont supercomputer) or a large cluster of
compute nodes (e.g. Grid5000), which yields different cost-performance trade-offs to be
studied. Figure 1 illustrates an infrastructure of in-transit data analysis of simulation
data, where simulation and computation of predictions are performed at a
supercomputer while the analysis of results to evaluate the simulation quality and
interpret the simulated phenomenon is done at a cluster.</p>
      <p>
        This architecture allows us to design generic techniques for data transfer,
partitioning and replication, as a basis for parallel data analysis and fault-tolerance in
DISC [<xref ref-type="bibr" rid="ref8">Liroz-Gistau 2013</xref>,
        <xref ref-type="bibr" rid="ref16 ref9">Silva 2017</xref>,
        <xref ref-type="bibr" rid="ref16 ref9">Souza 2017a</xref>,
        <xref ref-type="bibr" rid="ref16 ref9">2017b</xref>]. Additionally, envisioning
an almost real-time data transfer between the HPC system and the analytics platform, an
orchestrated and tuned set of components must be devised
        [<xref ref-type="bibr" rid="ref22">Matheus 2018</xref>]. Security
concerns, for instance, may restrict the exposure of simulation results through a single
HPC entry node, which rapidly turns into a bottleneck at the HPC side.
      </p>
    </sec>
    <sec id="sec-3">
      <title>3. From Simulation to Interactive Analysis and Visualization</title>
      <p>In complex simulations, users must track quantities of interest (residuals, error
estimates, etc.) to control as much execution as possible. However, this tracking is
typically done only after the simulation ends. We are designing techniques to extract,
index and relate strategic simulation data for online queries while simulation is running.</p>
      <p>We consider coupling these techniques with largely adopted libraries such as
libMesh (for numerical solvers) and ParaView (for visualization), so that queries on
quantities of interest are enhanced by visualization and provenance data. Interactive data
analysis support is planned for post-simulation and runtime as in-situ and in-transit,
taking advantage of memory access at runtime.</p>
      <p>
        In [Silva 2017], we propose a solution (architecture and algorithms) to combine
the advantages of a dataflow-aware SWMS and the raw data file analysis techniques to
allow for queries on raw data file elements that are related but reside in separate files.
Armful (https://hpcdb.github.io/armful/) is the name of the architecture and its main
components are a raw data extractor, a provenance gatherer and a query processing
interface, which are all dataflow aware. In
        [<xref ref-type="bibr" rid="ref36 ref37 ref6">Silva 2017</xref>], we instantiate Armful with the
Chiron SWMS [<xref ref-type="bibr" rid="ref3">Ogasawara 2011</xref>]. In
        [<xref ref-type="bibr" rid="ref22">Silva 2018</xref>], we remove the SWMS and
instantiate Armful as DfAnalyzer, a library of components to support online in-situ and
in-transit data analysis. DfAnalyzer components are plugged directly in the simulation
code of highly optimized parallel applications with negligible overhead. With support of
sophisticated online data analysis, scientists get a detailed view of the execution,
providing insights to determine when and how to tune parameters
        [<xref ref-type="bibr" rid="ref36 ref37 ref6">Souza 2017a</xref>,
        <xref ref-type="bibr" rid="ref4">Camata 2018</xref>,
        <xref ref-type="bibr" rid="ref22">Silva 2018</xref>]. In
        [<xref ref-type="bibr" rid="ref16 ref9">Souza 2017b</xref>] we evaluate a parameter sweep workflow
also in the Oil and Gas domain, this time using Spark to understand its scalability when
having to execute legacy black-box code with a DISC system. The source code of the
dataflow implementation for Spark is available on github
(github.com/hpcdb/RFASpark).
      </p>
      <p>
        We started investigating the combination of in-transit analysis and visualization,
with the development of SAVIME (Scientific Analysis and Visualization In-Memory).
The system adopts a multi-dimensional data model TARS (Typed Array Schema)
        [<xref ref-type="bibr" rid="ref36 ref37 ref6">Lustosa 2017</xref>] that enables the representation of simulation output data, the topology
mesh and simulation metadata. Data produced by the simulation in the HPC is ingested
without any transformation as blocks of a Typed Array (TAR) in real-time into
SAVIME, running in a Big Data cluster system. The communication between the two
systems is implemented using an extended RDMA protocol that bridges the HPC
computing nodes memory with a cluster receiver fatnode memory. SAVIME offers a set
of high-level operators that manipulate in-memory multi-dimensional arrays, split into
blocks. Query results can be streamed into ParaView for visualization in the cluster,
saving the HPC system from this extra load.
      </p>
      <p>
        Finally, we have devised techniques to efficiently assess the uncertainty in
simulation’s output. Our approach uses probability distribution functions (PDF) to fit
the output of a parameter sweep study and replaces data by the best fitting PDF at each
point. Next, we may answer uncertainty quantification queries on spatio-time regions of
the simulation output using the PDFs instead of the replaced data. The PDF computing
strategy has been implemented using different approaches in Apache Spark
        [<xref ref-type="bibr" rid="ref22">Liu 2018</xref>].
      </p>
    </sec>
    <sec id="sec-4">
      <title>4. Data Mining of Scientific Data</title>
      <p>
        The current data deluge produced in scientific applications has fostered the
development of new knowledge discovery techniques. In this context, an interesting
problem arises when the studied phenomenon can be modeled as spatial-time series. The
investigation of spatial-time series may shed light on patterns (motifs)
        [<xref ref-type="bibr" rid="ref23 ref7">Mueen 2014</xref>]
and can be used in predicting future series behavior
        [<xref ref-type="bibr" rid="ref10">Dhar 2013</xref>]. In this context, we
focus on the design of new algorithms to harvest large datasets of space-time series
looking for patterns that are relevant for the scientific domain studied (seismic, astronomy,
and sensor data sources). Such datasets can even appear distributed on different sites
        [<xref ref-type="bibr" rid="ref1">Allard 2015</xref>]. This work capitalizes on our previous results in data transformations
[Ogasawara 2010] and sequence mining
        [<xref ref-type="bibr" rid="ref5">Campisano 2016</xref>].
      </p>
      <p>
        In [<xref ref-type="bibr" rid="ref6">Campisano 2017</xref>], we tackle the problem of finding tight space-time
sequences, i.e., find within the same process: frequent sequences constrained in space
and time that may not be frequent in the entire dataset, and the time interval and space
range where these sequences are frequent. The discovery of such patterns along with
their constraints may lead to extract valuable knowledge that can remain hidden using
traditional methods since their support is extremely low over the entire dataset.
      </p>
      <p>
        We introduce a new spatiotemporal Sequence Miner (STSM) algorithm to
discover tight space-time sequences. We evaluate STSM using a seismic use case and
illustrate its ability to detect frequent sequences constrained in space and time. When
compared with general spatial-time sequence mining algorithms, STSM allows for new
insights by detecting maximal space-time areas where each pattern is frequent.
Additionally, in [<xref ref-type="bibr" rid="ref9">Cruz 2017</xref>], we started studying sensor data sources using
spatial-temporal aggregations from trajectories of the buses of Rio de Janeiro. As a preliminary
work on this subject, we established a baseline for anomaly identification in urban
mobility, which may be useful for developing new approaches that help better discover
patterns and understand urban mobility systems.
      </p>
    </sec>
    <sec id="sec-5">
      <title>5. Machine Learning and Recommendation</title>
      <p>
        Scientists commonly explore several input data files and parameter values in different
executions of scientific workflows. These workflows can execute for days in DISC
environments and they are costly both in terms of execution time and financial cost
        [<xref ref-type="bibr" rid="ref29">Liu 2016</xref>, 2017,
        <xref ref-type="bibr" rid="ref5">Pineda-Morales 2016</xref>]. It is fundamental that input data files and parameter
values chosen for a specific workflow execution do not produce undesired results. In
addition, depending on how parameters are set, the workflow execution may present a
better performance. Today, scientists spend much time choosing appropriate parameter
values and data files based on their experience, but this is an error-prone task since
many of these parameters are not independent of each other, i.e., if one parameter is
modified, it may imply changing the values of many other parameters of the
workflow. It is worth noticing that this parameter space opens room for parameter fine
tuning and consequently improvements both in performance and quality of results.
However, due to the (very) large parameter space, this parameter recommendation is an
open problem. Our proposal is to use provenance data captured during previous
workflow executions to recommend data files and parameter values for future
executions. We use Machine Learning (ML) algorithms
        [<xref ref-type="bibr" rid="ref31">Raedt 2008</xref>] to predict which
data files and parameters are more suitable for an execution.
      </p>
      <p>
        We have developed a series of predictive models
        [<xref ref-type="bibr" rid="ref20 ref4">Silva Jr 2018</xref>] in order to
identify which combinations of data files and parameter values produce results with
more quality and in less time. We use as input datasets provenance traces from SciPhy
(bioinformatics) and Montage (astronomy) workflows (workflows for which we have access
to specialists who can explain how to measure the quality of results). This way, we are able
to suggest “ideal” parameter values and data files for scientists that will produce results
with more quality and/or less time. These predictive models are based on traditional ML
algorithms such as Classification Trees, Support Vector Machines (SVM), One Class
SVM and Inductive Logic Programming (ILP). Each predictive model presents different
precision and accuracy, and it may be required to choose the best one before
recommending parameter values and data files to use. Thus, we plan to use user
feedback to fine-tune the recommendation
        [<xref ref-type="bibr" rid="ref1">Servajean 2015</xref>], i.e., we have a 2-level
recommendation scenario. First, we have to recommend which predictive model to use
and then run this model with new data to finally recommend the parameter values and
data files for workflow executions. This combination of ML and feedback is novel when
compared with existing approaches
        [<xref ref-type="bibr" rid="ref3">Ferro 2011</xref>, Huang 2013].
      </p>
    </sec>
    <sec id="sec-6">
      <title>6. Conclusion</title>
      <p>The SciDISC project addresses the grand challenge of scientific data analysis using
DISC, by developing architectures and methods to combine simulation and data
analysis. In this paper, we introduced the motivations and objectives of the project, and
reported on the first results achieved so far in terms of generic architecture, interactive
analysis of simulation data and visualization, data mining of scientific data, and machine
learning and recommendation.</p>
      <p>The first results are quite encouraging and lead to exciting future work. Based on
in-situ data extraction and analysis, we plan to improve our dataflow monitoring,
debugging and extend our support for adaptation at runtime like parameter fine-tuning
and data reduction. We will also continue the development of the SAVIME system. The
aim is to compute, almost in real time, simulation output analysis and ready-to-consume
visualization output. Regarding post-processing of simulation data, we will
continue to study Spark, one of the most popular DISC systems, and explore it as a
platform for efficiently computing probability distribution functions on numerical
simulation output during a parameter sweep exploration. We will pursue our work on
data mining of spatial-time series in two main areas: compare motif identification
techniques with sequence mining techniques and explore spatial-temporal aggregation
techniques of sensor data to enable spatiotemporal pattern mining. Regarding ML and
recommendation, we have developed a series of predictive models to suggest parameter
values and data files for workflow executions. Since these models present different
accuracy and precision, it may be difficult to choose a specific model to predict
parameters and data files. Thus, we propose to develop a recommendation system that
will allow users to choose the best predictive model based on opinions of colleagues
and other users, and on the performance of such predictive models on previous
recommendations. This recommendation process is being implemented within the
SciManager system (www.scimanager.ic.uff.br).</p>
    </sec>
    <sec id="sec-7">
      <title>7. Acknowledgements</title>
      <p>This work was partially funded by CNPq, FAPERJ and Inria (SciDISC project), EU
H2020 Programme and MCTI/RNP-Brazil (HPC4E grant no. 689772), and performed
(for Inria) in the context of the Computational Biology Institute
(www.ibc-montpellier.fr). The experiments in SciDISC are carried out using the Inria Grid'5000
testbed (www.grid5000.fr), NACAD/COPPE supercomputers and LNCC SINAPAD
Santos Dumont supercomputer (sdumont.lncc.br).</p>
    </sec>
  </body>
  <back>
    <ref-list>
      <ref id="ref1">
        <mixed-citation>
          <string-name>
            <given-names>T.</given-names>
            <surname>Allard</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Hébrail</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Masseglia</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Pacitti</surname>
          </string-name>
          .
          <article-title>Chiaroscuro: Transparency and Privacy for Massive Personal Time-Series Clustering</article-title>
          .
          <source>SIGMOD Conference</source>
          ,
          <fpage>779</fpage>
          -
          <lpage>794</lpage>
          ,
          <year>2015</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref2">
        <mixed-citation>
          <string-name>
            <given-names>C.</given-names>
            <surname>Bondiombouy</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          .
          <article-title>Query Processing in Multistore Systems</article-title>
          .
          <source>Int. Journal of Cloud Computing</source>
          ,
          <volume>5</volume>
          (
          <issue>4</issue>
          ):
          <fpage>309</fpage>
          -
          <lpage>346</lpage>
          ,
          <year>2016</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref3">
        <mixed-citation>
          <string-name>
            <given-names>R.</given-names>
            <surname>Bryant</surname>
          </string-name>
          .
          <article-title>Data-Intensive Scalable Computing for Scientific Applications</article-title>
          .
          <source>Computing in Science &amp; Engineering</source>
          ,
          <volume>13</volume>
          (
          <issue>6</issue>
          ):
          <fpage>25</fpage>
          -
          <lpage>33</lpage>
          ,
          <year>2011</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref4">
        <mixed-citation>
          <string-name>
            <given-names>J.</given-names>
            <surname>Camata</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Silva</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mattoso</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Coutinho</surname>
          </string-name>
          .
          <article-title>In Situ Visualization and Data Analysis for Turbidity Currents Simulation</article-title>
          .
          <source>Computers &amp; Geosciences</source>
          ,
          <volume>110</volume>
          , pp.
          <fpage>23</fpage>
          -
          <lpage>31</lpage>
          ,
          <year>2018</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref5">
        <mixed-citation>
          <string-name>
            <given-names>R.</given-names>
            <surname>Campisano</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Porto</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Pacitti</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Masseglia</surname>
          </string-name>
          , and
          <string-name>
            <given-names>E.</given-names>
            <surname>Ogasawara</surname>
          </string-name>
          .
          <article-title>Spatial Sequential Pattern Mining for Seismic Data</article-title>
          .
          <source>SBBD Conference</source>
          ,
          <year>2016</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref6">
        <mixed-citation>
          <string-name>
            <given-names>R.</given-names>
            <surname>Campisano</surname>
          </string-name>
          ,
          <article-title>Sequence Mining in Spatial-Time Series (Master Degree Dissertation)</article-title>
          ,
          <source>CEFET/RJ</source>
          ,
          <year>2017</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref7">
        <mixed-citation>
          <string-name>
            <given-names>A.</given-names>
            <surname>Coutinho</surname>
          </string-name>
          .
          <article-title>Computational Science and Big Data: Where are We Now?</article-title>
          <source>XLDB Workshop</source>
          , http://xldbrio2014.linea.gov.br/program,
          <year>2014</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref8">
        <mixed-citation>
          <string-name>
            <given-names>T.</given-names>
            <surname>Critchlow</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Kleese van Dam</surname>
          </string-name>
          .
          <source>Data-Intensive Science</source>
          .
          <publisher-name>Chapman and Hall/CRC</publisher-name>
          ,
          <year>2013</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref9">
        <mixed-citation>
          <string-name>
            <given-names>A.B.</given-names>
            <surname>Cruz</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Ferreira</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Monteiro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Coutinho</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Porto</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Ogasawara</surname>
          </string-name>
          ,
          <article-title>Detecção de Anomalias no Transporte Rodoviário Urbano</article-title>
          ,
          <source>Brazilian Symposium on Databases (SBBD)</source>
          ,
          <year>2017</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref10">
        <mixed-citation>
          <string-name>
            <given-names>V.</given-names>
            <surname>Dhar</surname>
          </string-name>
          ,
          <article-title>Data Science and Prediction</article-title>
          .
          <source>Comm. of ACM</source>
          ,
          <volume>56</volume>
          (
          <issue>12</issue>
          ):
          <fpage>64</fpage>
          -
          <lpage>73</lpage>
          ,
          <year>2013</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref11">
        <mixed-citation>
          <string-name>
            <given-names>M.</given-names>
            <surname>Ferro</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A. R.</given-names>
            <surname>Mury</surname>
          </string-name>
          , and
          <string-name>
            <given-names>B.</given-names>
            <surname>Schulze</surname>
          </string-name>
          .
          <article-title>A proposal to apply inductive logic programming to self-healing problem in grid computing: How will it work?</article-title>
          <source>Concurrency and Computation: Practice and Experience</source>
          ,
          <volume>23</volume>
          :
          <fpage>2118</fpage>
          -
          <lpage>2135</lpage>
          ,
          <year>2011</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref12">
        <mixed-citation>
          <string-name>
            <given-names>G.</given-names>
            <surname>Fox</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Qiu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Jha</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Ekanayake</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Kamburugamuve</surname>
          </string-name>
          .
          <article-title>Big Data, Simulations and HPC Convergence</article-title>
          .
          <uri>https://www.researchgate.net/publication/301231174</uri>
          ,
          <year>2016</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref13">
        <mixed-citation>
          <string-name>
            <given-names>D.</given-names>
            <surname>Gaspar</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Porto</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Akbarinia</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Pacitti</surname>
          </string-name>
          ,
          <article-title>TARDIS: Optimal Execution of Scientific Workflows in Apache Spark</article-title>
          .
          <source>Int. Conf. on Big Data Analytics and Knowledge Discovery (DaWaK)</source>
          ,
          <fpage>74</fpage>
          -
          <lpage>87</lpage>
          ,
          <year>2017</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref14">
        <mixed-citation>
          <string-name>
            <given-names>T.</given-names>
            <surname>Hey</surname>
          </string-name>
          .
          <article-title>The Fourth Paradigm: Data-Intensive Scientific Discovery</article-title>
          .
          <source>Microsoft Research</source>
          ,
          <year>2009</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref15">
        <mixed-citation>
          <string-name>
            <given-names>X.</given-names>
            <surname>Huang</surname>
          </string-name>
          ,
          <string-name>
            <given-names>T.</given-names>
            <surname>Lu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>X.</given-names>
            <surname>Ding</surname>
          </string-name>
          , and
          <string-name>
            <given-names>N.</given-names>
            <surname>Gu</surname>
          </string-name>
          .
          <article-title>Enabling Data Recommendation in Scientific Workflow Based on Provenance</article-title>
          .
          <source>8th China Grid Annual Conference</source>
          ,
          <fpage>1</fpage>
          -
          <lpage>8</lpage>
          ,
          <year>2013</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref16">
        <mixed-citation>
          <string-name>
            <given-names>A.</given-names>
            <surname>Khatibi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Porto</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Rittmeyer</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Ogasawara</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Shasha</surname>
          </string-name>
          .
          <article-title>Pre-processing and Indexing Techniques for Constellation Queries in Big Data</article-title>
          .
          <source>Int. Conf. on Big Data Analytics and Knowledge Discovery (DaWaK)</source>
          ,
          <fpage>164</fpage>
          -
          <lpage>172</lpage>
          ,
          <year>2017</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref17">
        <mixed-citation>
          <string-name>
            <given-names>M.</given-names>
            <surname>Liroz-Gistau</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Akbarinia</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Pacitti</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Porto</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          .
          <article-title>Dynamic Workload-based Partitioning Algorithms for Continuously Growing Databases</article-title>
          .
          <source>Trans. on Large-Scale Data- and Knowledge-Centered Systems</source>
          , Springer,
          <volume>12</volume>
          :
          <fpage>105</fpage>
          -
          <lpage>128</lpage>
          ,
          <year>2013</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref18">
        <mixed-citation>
          <string-name>
            <given-names>J.</given-names>
            <surname>Liu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Pacitti</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>de Oliveira</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mattoso</surname>
          </string-name>
          .
          <article-title>Multi-Objective Scheduling of Scientific Workflows in Multisite Clouds</article-title>
          .
          <source>Future Generation Computer Systems</source>
          , Elsevier,
          <volume>63</volume>
          :
          <fpage>76</fpage>
          -
          <lpage>95</lpage>
          ,
          <year>2016</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref19">
        <mixed-citation>
          <string-name>
            <given-names>J.</given-names>
            <surname>Liu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Pacitti</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mattoso</surname>
          </string-name>
          .
          <article-title>Scientific Workflow Scheduling with Provenance Data in a Multisite Cloud</article-title>
          .
          <source>Trans. on Large-Scale Data- and Knowledge-Centered Systems (TLDKS)</source>
          ,
          <volume>33</volume>
          :
          <fpage>80</fpage>
          -
          <lpage>112</lpage>
          ,
          <year>2017</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref20">
        <mixed-citation>
          <string-name>
            <given-names>J.</given-names>
            <surname>Liu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E. N.</given-names>
            <surname>Lemus</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Pacitti</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Porto</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          ,
          <article-title>Parallel Computation of PDFs on Big Spatial Data Using Spark</article-title>
          , arXiv:
          <pub-id pub-id-type="arxiv">1805.03141</pub-id>
          ,
          <year>2018</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref21">
        <mixed-citation>
          <string-name>
            <given-names>H.</given-names>
            <surname>Lustosa</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Porto</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Lemus</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          ,
          <article-title>TARS: An Extension of the Multi-dimensional Array Model</article-title>
          ,
          <source>ER FORUM - Conceptual Modeling: Research In Progress</source>
          , Valencia,
          <year>2017</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref22">
        <mixed-citation>
          <string-name>
            <given-names>A.</given-names>
            <surname>Matheus</surname>
          </string-name>
          ,
          <string-name>
            <given-names>H.</given-names>
            <surname>Lustosa</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Porto</surname>
          </string-name>
          ,
          <string-name>
            <given-names>B.</given-names>
            <surname>Schulze</surname>
          </string-name>
          ,
          <article-title>Towards In-transit Analysis on Supercomputing Environments</article-title>
          , arXiv:
          <pub-id pub-id-type="arxiv">1805.06425</pub-id>
          ,
          <year>2018</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref23">
        <mixed-citation>
          <string-name>
            <given-names>A.</given-names>
            <surname>Mueen</surname>
          </string-name>
          .
          <article-title>Time series motif discovery: Dimensions and applications</article-title>
          .
          <source>Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery</source>
          ,
          <volume>4</volume>
          (
          <issue>2</issue>
          ):
          <fpage>152</fpage>
          -
          <lpage>159</lpage>
          ,
          <year>2014</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref24">
        <mixed-citation>
          <string-name>
            <given-names>E.</given-names>
            <surname>Ogasawara</surname>
          </string-name>
          ,
          <string-name>
            <given-names>L.C.</given-names>
            <surname>Martinez</surname>
          </string-name>
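          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>de Oliveira</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Zimbrao</surname>
          </string-name>
          ,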
          <string-name>
            <given-names>G.L.</given-names>
            <surname>Pappa</surname>
          </string-name>
          , and
          <string-name>
            <given-names>M.</given-names>
            <surname>Mattoso</surname>
          </string-name>
          .
          <article-title>Adaptive Normalization: A novel data normalization approach for non-stationary time series</article-title>
          .
          <source>Int. Joint Conf. on Neural Networks (IJCNN)</source>
          ,
          <fpage>1</fpage>
          -
          <lpage>8</lpage>
          ,
          <year>2010</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref25">
        <mixed-citation>
          <string-name>
            <given-names>E.</given-names>
            <surname>Ogasawara</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Dias</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Oliveira</surname>
          </string-name>
          ,
          <string-name>
            <given-names>F.</given-names>
            <surname>Porto</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mattoso</surname>
          </string-name>
          .
          <article-title>An Algebraic Approach for Datacentric Scientific Workflows</article-title>
          .
          <source>Proceedings of the VLDB Endowment (PVLDB)</source>
          ,
          <volume>4</volume>
          (
          <issue>12</issue>
          ):
          <fpage>1328</fpage>
          -
          <lpage>1339</lpage>
          ,
          <year>2011</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref26">
        <mixed-citation>
          <string-name>
            <given-names>R.</given-names>
            <surname>Oldfield</surname>
          </string-name>
          ,
          <string-name>
            <given-names>K.</given-names>
            <surname>Moreland</surname>
          </string-name>
          ,
          <string-name>
            <given-names>N.</given-names>
            <surname>Fabian</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Rogers</surname>
          </string-name>
          .
          <article-title>Evaluation of Methods to Integrate Analysis into a Large-Scale Shock Physics Code</article-title>
          .
          <source>ACM Int. Conf. on Supercomputing</source>
          ,
          <fpage>83</fpage>
          -
          <lpage>92</lpage>
          ,
          <year>2014</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref27">
        <mixed-citation>
          <string-name>
            <given-names>T.</given-names>
            <surname>Özsu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          .
          <source>Principles of Distributed Database Systems - Third Edition</source>
          . Springer, 850 p,
          <year>2011</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref28">
        <mixed-citation>
          <string-name>
            <given-names>E.</given-names>
            <surname>Pacitti</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Akbarinia</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>El Dick</surname>
          </string-name>
          :
          <article-title>P2P Techniques for Decentralized Applications</article-title>
          .
          <source>Synthesis Lectures on Data Management</source>
          , Morgan &amp; Claypool Publishers,
          <year>2012</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref29">
        <mixed-citation>
          <string-name>
            <given-names>L.</given-names>
            <surname>Pineda-Morales</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Liu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Costan</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Pacitti</surname>
          </string-name>
          ,
          <string-name>
            <given-names>G.</given-names>
            <surname>Antoniu</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mattoso</surname>
          </string-name>
          .
          <article-title>Managing hot metadata for scientific workflows on multisite clouds</article-title>
          .
          <source>IEEE BigData Conf</source>
          ,
          <fpage>390</fpage>
          -
          <lpage>397</lpage>
          ,
          <year>2016</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref30">
        <mixed-citation>
          <string-name>
            <given-names>F.</given-names>
            <surname>Porto</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Khatibi</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Nobre</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Ogasawara</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Shasha</surname>
          </string-name>
          .
          <article-title>Point Pattern Search in Big Data</article-title>
          .
          <source>Int. Conf. on Scientific and Statistical Database Management (SSDBM)</source>
          ,
          <year>2018</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref31">
        <mixed-citation>
          <string-name>
            <given-names>L.</given-names>
            <surname>De Raedt</surname>
          </string-name>
          .
          <article-title>Logical and Relational Learning: From ILP to MRDM (Cognitive Technologies)</article-title>
          . Springer, New York,
          <year>2008</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref32">
        <mixed-citation>
          <string-name>
            <given-names>M.</given-names>
            <surname>Servajean</surname>
          </string-name>
          ,
          <string-name>
            <given-names>R.</given-names>
            <surname>Akbarinia</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Pacitti</surname>
          </string-name>
          ,
          <string-name>
            <given-names>S.</given-names>
            <surname>Amer-Yahia</surname>
          </string-name>
          .
          <article-title>Profile Diversity for Query Processing using User Recommendations</article-title>
          .
          <source>Information Systems</source>
          ,
          <volume>48</volume>
          :
          <fpage>44</fpage>
          -
          <lpage>63</lpage>
          ,
          <year>2015</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref33">
        <mixed-citation>
          <string-name>
            <given-names>V.</given-names>
            <surname>Silva</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Leite</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Camata</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>de Oliveira</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Coutinho</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mattoso</surname>
          </string-name>
          .
          <article-title>Raw data queries during data-intensive parallel workflow execution</article-title>
          .
          <source>Future Generation Computer Systems</source>
          , Elsevier,
          <volume>75</volume>
          :
          <fpage>402</fpage>
          -
          <lpage>422</lpage>
          ,
          <year>2017</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref34">
        <mixed-citation>
          <string-name>
            <given-names>V.</given-names>
            <surname>Silva</surname>
          </string-name>
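          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>de Oliveira</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mattoso</surname>
          </string-name>
          .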
          <article-title>DfAnalyzer: Runtime Dataflow Analysis of Scientific Applications using Provenance</article-title>
          ,
          <source>Proceedings of the VLDB Endowment</source>
          ,
          <year>2018</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref35">
        <mixed-citation>
          <string-name>
            <given-names>D.</given-names>
            <surname>Silva</surname>
            <suffix>Jr.</suffix>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Paes</surname>
          </string-name>
          ,
          <string-name>
            <given-names>E.</given-names>
            <surname>Pacitti</surname>
          </string-name>
          ,
          <string-name>
            <given-names>D.</given-names>
            <surname>Oliveira</surname>
          </string-name>
          .
          <article-title>Data Quality Prediction in Scientific Workflows</article-title>
          . In preparation,
          <year>2018</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref36">
        <mixed-citation>
          <string-name>
            <given-names>R.</given-names>
            <surname>Souza</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Silva</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Miranda</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Lima</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mattoso</surname>
          </string-name>
          .
          <article-title>Spark Scalability Analysis in a Scientific Workflow</article-title>
          .
          <source>Brazilian Symposium on Databases (SBBD)</source>
          ,
          <comment>Best Paper Award</comment>
          ,
          <year>2017</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref37">
        <mixed-citation>
          <string-name>
            <given-names>R.</given-names>
            <surname>Souza</surname>
          </string-name>
          ,
          <string-name>
            <given-names>V.</given-names>
            <surname>Silva</surname>
          </string-name>
          ,
          <string-name>
            <given-names>J.</given-names>
            <surname>Camata</surname>
          </string-name>
          ,
          <string-name>
            <given-names>A.</given-names>
            <surname>Coutinho</surname>
          </string-name>
          ,
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          ,
          <string-name>
            <given-names>M.</given-names>
            <surname>Mattoso</surname>
          </string-name>
          .
          <article-title>Tracking of Online Parameter Fine-tuning in Scientific Workflows</article-title>
          .
          <source>Workshop on Workflows in Support of Large-Scale Science (WORKS)</source>
          , ACM/IEEE Supercomputing Conference,
          <year>2017</year>
          .
        </mixed-citation>
      </ref>
      <ref id="ref38">
        <mixed-citation>
          <string-name>
            <given-names>P.</given-names>
            <surname>Valduriez</surname>
          </string-name>
          .
          <article-title>Data-intensive HPC: opportunities and challenges</article-title>
          .
          <source>Big Data and Extreme-scale computing (BDEC)</source>
          ,
          <uri>http://hal-lirmm.ccsd.cnrs.fr/lirmm-01184018</uri>
          ,
          <year>2015</year>
          .
        </mixed-citation>
      </ref>
    </ref-list>
  </back>
</article>