# [Volute] r3628 - trunk/projects/dm/provenance/description

Volute commit messages volutecommits at g-vo.org
Sat Oct 15 00:38:49 CEST 2016

Author: kriebe
Date: Sat Oct 15 00:38:49 2016
New Revision: 3628

Log:
Renamed ProvDm => ProvenanceDM, included new modelio figures, minor rewrites at several places

trunk/projects/dm/provenance/description/ProvenanceDM.pdf
- copied, changed from r3626, trunk/projects/dm/provenance/description/ProvDM.pdf
trunk/projects/dm/provenance/description/ProvenanceDM.tex
- copied, changed from r3626, trunk/projects/dm/provenance/description/ProvDM.tex
Deleted:
trunk/projects/dm/provenance/description/ProvDM.pdf
trunk/projects/dm/provenance/description/ProvDM.tex
Modified:
trunk/projects/dm/provenance/description/Makefile
trunk/projects/dm/provenance/description/datamodel-description.tex
trunk/projects/dm/provenance/description/datamodel-discussion.tex
trunk/projects/dm/provenance/description/intro-VOarchitecture.tex
trunk/projects/dm/provenance/description/intro-general.tex
trunk/projects/dm/provenance/description/intro-previousefforts.tex
trunk/projects/dm/provenance/description/prov-refs.bib
trunk/projects/dm/provenance/description/usecases-implementations.tex

Modified: trunk/projects/dm/provenance/description/Makefile
==============================================================================
--- trunk/projects/dm/provenance/description/Makefile	Sat Oct 15 00:37:56 2016	(r3627)
+++ trunk/projects/dm/provenance/description/Makefile	Sat Oct 15 00:38:49 2016	(r3628)
@@ -1,7 +1,7 @@
# ivoatex Makefile.  See the ivoatex/README for the targets available.

# short name of your document (edit \$DOCNAME.tex; would be like RegTAP)
-DOCNAME = ProvDM
+DOCNAME = ProvenanceDM

# count up; you probably do not want to bother with versions <1.0
DOCVERSION = 0.3

Copied and modified: trunk/projects/dm/provenance/description/ProvenanceDM.pdf (from r3626, trunk/projects/dm/provenance/description/ProvDM.pdf)
==============================================================================
Binary file (source and/or target). No diff available.

Copied and modified: trunk/projects/dm/provenance/description/ProvenanceDM.tex (from r3626, trunk/projects/dm/provenance/description/ProvDM.tex)
==============================================================================
--- trunk/projects/dm/provenance/description/ProvDM.tex	Fri Oct 14 23:55:10 2016	(r3626, copy source)
+++ trunk/projects/dm/provenance/description/ProvenanceDM.tex	Sat Oct 15 00:38:49 2016	(r3628)
@@ -10,8 +10,8 @@

\author{Kristin Riebe}
\author{Michèle Sanguillon}
-\author{Florian Rothmaier}
\author{Mathieu Servillat}
+\author{Florian Rothmaier}
\author{Mireille Louys}
\author{François Bonnarel}

@@ -92,7 +92,7 @@
\section*{Acknowledgments}

This document has been developed in part with support from the German
-Astrophysical Virtual Observatory (BMBF Bewilligungsnummer 05A08VHA).
+Astrophysical Virtual Observatory, funded by BMBF Bewilligungsnummer 05A08VHA.

Thanks for fruitful discussions to (in alphabetical order):
François Bonnarel, Markus Demleitner, Jochen Klar, Gerard Lemson,
@@ -131,6 +131,13 @@
other VO data models (e.g. ObscoreDM, DatasetDM, SpectralDM, SimDM) and how provenance information can be stored.

+\begin{figure}[h]
+\centering
+\includegraphics[width=\textwidth]{../datamodel-diagrams/classes-relations-dms}
+\caption{Links between Agent and Party, Entity and Dataset.}
+\label{fig:class-relations-dm}
+\end{figure}
+
\TODO{Put this in appendix? Or in datamodel-description? Or even in introduction?}

Modified: trunk/projects/dm/provenance/description/datamodel-description.tex
==============================================================================
--- trunk/projects/dm/provenance/description/datamodel-description.tex	Sat Oct 15 00:37:56 2016	(r3627)
+++ trunk/projects/dm/provenance/description/datamodel-description.tex	Sat Oct 15 00:38:49 2016	(r3628)
@@ -10,15 +10,12 @@

\begin{figure}[h]
\centering
-\includegraphics[width=1.0\textwidth]{../datamodel-diagrams/2016-05-03_IVOA_ProvenanceDM.png}
-\caption{Class diagram for the provenance data model. The blue classes are core
-elements. Their names match the corresponding counterparts in the W3C provenance
-data model.}
+\includegraphics[width=1.0\textwidth]{../datamodel-diagrams/classes-overview}
+\caption{Overview of the classes for the provenance data model in a class diagram. The blue classes are core elements. Their names match the corresponding counterparts in the W3C provenance
+data model. Green classes belong to other IVOA data models (IVOA Dataset Data Model).}
\label{fig:classdiagram}
\end{figure}

-\TODO{Produce Modelio version of the data model, update!}
-

\subsection{Main classes}\label{sec:core}
% Some examples for different use cases are given in Section \ref{sec:usecases-implementations}.
@@ -26,7 +23,7 @@

\begin{figure}[h]
\centering
-\includegraphics[width=0.8\textwidth]{../datamodel-diagrams/ProvDM-core-diagram.png}
+\includegraphics[scale=0.8]{../datamodel-diagrams/classes-core-w3c}
\caption{The main core classes and relations of the Provenance Data Model, which also occur in the W3C model.}
\label{fig:coreclasses}
\end{figure}
@@ -103,13 +100,14 @@
table, an image or a collection of them. The Dataset Data Model
\citep{std:DatasetDM} specifies an ``IVOA Dataset'' as ``a file or files which
are considered to be a single deliverable''. We adopt this definition here and
-define \class{Dataset} as a subclass to \class{Entity}, as shown in
+link \class{Dataset} and \class{Entity} via a composition relation, as shown in
Figure \ref{fig:entityclasses}.

\begin{figure}[h]
\centering
-\includegraphics[width=0.8\textwidth]{../datamodel-diagrams/ProvDM-entity-classes.png}
-\caption{The Entity class and related subclasses}
+\includegraphics[scale=0.5]{../datamodel-diagrams/classes-entity-collection}
+\caption{The relation between Entity, Dataset and Collection. The Dataset class belongs to
+the IVOA Dataset Data Model.}
\label{fig:entityclasses}
\end{figure}

@@ -138,7 +136,6 @@
}\label{tab:entity-attributes}
\end{table}

-
\begin{table}[h]
\small
\tymax	0.5\textwidth
@@ -231,6 +228,8 @@
\TODO{Do we allow empty collections? Or should collections always contain at least 1 member? (otherwise they are just prov:entities?)}

+
+
The entity-collection relation can be modeled using the \emph{Composite} design pattern:
Collection is a subclass of Entity, but also an aggregation of 1 to many entities,
which could be collections themselves.
@@ -552,76 +551,3 @@

-\subsection{Discussion}
-
-
-It would be convenient, if each data object or even each file
-gets a unique id that can be referenced. The W3C provenance model requires ids
-for entities, activities and agents, and they have to be qualified strings,
-i.e. containing a namespace. For example, an activity in the RAVE-pipeline could
-have the id \texttt{'rave:radialvelocity\_pipeline'}. Using a namespace for each
-project for these ids will help to make them unique.
-
-If several copies of a dataset exist, and one of them is corrupted, it would even be useful to know
-exactly which copy was used by a given activity. This can be modeled already
-with the existing tools (using a copy-activity), but we doubt that many people
-would actually need this level of detail.
-
-\TODO{What about DOI's for datasets? They should be unique. Maybe add another
-
-
-\subsubsection{Calibration data}
-The calibration dataset consists of images that can be used to calibrate the
-raw data. It is not necessary to mention them explicitly in the model,
-they are just another dataset that is used by activities with a
-calibration-method.
-
-
-\subsubsection{Parameters}
-We consider adding a parameter class for describing additional properties of activities.
-
-For example for observations, the \emph{ambient conditions} as well as
-\emph{instrument characteristics} need to be stored. But they can both be treated
-Our model can then also take into account that a certain observation
-method requires special ambient conditions, already defined via the
-ActivityDescription (e.g. radio observations rely on different ambient
-conditions than observations
-of gamma rays), just following our data -- data description scheme.
-Ambient conditions are recorded for a certain time (startTime, endTime) and are
-usually only valid for a certain time interval. This time interval should be recorded
-with a \emph{validity}-attribute for such entities.
-
-In contrast to ambient conditions, instrument characteristics do (usually) not
-change from one observation to the other, so they are static, strictly related to
-the instrument.
-All the characteristics could be described either as key-value pairs directly with the
-observation (as attributes) or just as datasets, using the \class{Entity} class.
-One would then
-link the instrument characteristics as a type of input (or output?) dataset to a certain
-observation activity. Thus we don't need a separate Instrument or Device class.
-
-\Note{One should also keep in mind that some instrument related parameters can change within time,
-e.g. the CCD temperature. The instruments can also change within time because of aging.}
-
-
-\subsubsection{Quality}
-For expressing the quality of data, we could simply define additional
-attributes for each \class{Activity}
-or \class{DataEntity} object, i.e. zero, one, or more properties in the form of
-key-value pairs. We could use a \class{Quality} namespace to mark a keyword
-as quality-related:
-\begin{itemize}
-    \item quality:comment: [some text]
-    \item quality:seeing: [some value]
-\end{itemize}
-The values could range from a float number to free text.
-
-
-\subsubsection{Provenance of provenance}
-``Bundles'' are used to name a set of provenance descriptions. It is a type for
-an entity, and allows one to express provenance of provenance. This is probably also
-very interesting for workflow systems.
-

Modified: trunk/projects/dm/provenance/description/datamodel-discussion.tex
==============================================================================
--- trunk/projects/dm/provenance/description/datamodel-discussion.tex	Sat Oct 15 00:37:56 2016	(r3627)
+++ trunk/projects/dm/provenance/description/datamodel-discussion.tex	Sat Oct 15 00:38:49 2016	(r3628)
@@ -1,3 +1,76 @@
+\subsection{Discussion}
+
+
+It would be convenient, if each data object or even each file
+gets a unique id that can be referenced. The W3C provenance model requires ids
+for entities, activities and agents, and they have to be qualified strings,
+i.e. containing a namespace. For example, an activity in the RAVE-pipeline could
+have the id \texttt{'rave:radialvelocity\_pipeline'}. Using a namespace for each
+project for these ids will help to make them unique.
+
+If several copies of a dataset exist, and one of them is corrupted, it would even be useful to know
+exactly which copy was used by a given activity. This can be modeled already
+with the existing tools (using a copy-activity), but we doubt that many people
+would actually need this level of detail.
+
+\TODO{What about DOI's for datasets? They should be unique. Maybe add another
+
+
+\subsubsection{Calibration data}
+The calibration dataset consists of images that can be used to calibrate the
+raw data. It is not necessary to mention them explicitly in the model,
+they are just another dataset that is used by activities with a
+calibration-method.
+
+
+\subsubsection{Parameters}
+We consider adding a parameter class for describing additional properties of activities.
+
+For example for observations, the \emph{ambient conditions} as well as
+\emph{instrument characteristics} need to be stored. But they can both be treated
+Our model can then also take into account that a certain observation
+method requires special ambient conditions, already defined via the
+ActivityDescription (e.g. radio observations rely on different ambient
+conditions than observations
+of gamma rays), just following our data -- data description scheme.
+Ambient conditions are recorded for a certain time (startTime, endTime) and are
+usually only valid for a certain time interval. This time interval should be recorded
+with a \emph{validity}-attribute for such entities.
+
+In contrast to ambient conditions, instrument characteristics do (usually) not
+change from one observation to the other, so they are static, strictly related to
+the instrument.
+All the characteristics could be described either as key-value pairs directly with the
+observation (as attributes) or just as datasets, using the \class{Entity} class.
+One would then
+link the instrument characteristics as a type of input (or output?) dataset to a certain
+observation activity. Thus we don't need a separate Instrument or Device class.
+
+\Note{One should also keep in mind that some instrument related parameters can change within time,
+e.g. the CCD temperature. The instruments can also change within time because of aging.}
+
+
+\subsubsection{Quality}
+For expressing the quality of data, we could simply define additional
+attributes for each \class{Activity}
+or \class{DataEntity} object, i.e. zero, one, or more properties in the form of
+key-value pairs. We could use a \class{Quality} namespace to mark a keyword
+as quality-related:
+\begin{itemize}
+    \item quality:comment: [some text]
+    \item quality:seeing: [some value]
+\end{itemize}
+The values could range from a float number to free text.
+
+
+\subsubsection{Provenance of provenance}
+``Bundles'' are used to name a set of provenance descriptions. It is a type for
+an entity, and allows one to express provenance of provenance. This is probably also
+very interesting for workflow systems.
+
\subsubsection{Discussion of description side}
This model was established with mainly having a database implementation in mind.
However, it may be better in the long run to store provenance with

Modified: trunk/projects/dm/provenance/description/intro-VOarchitecture.tex
==============================================================================
--- trunk/projects/dm/provenance/description/intro-VOarchitecture.tex	Sat Oct 15 00:37:56 2016	(r3627)
+++ trunk/projects/dm/provenance/description/intro-VOarchitecture.tex	Sat Oct 15 00:38:49 2016	(r3628)
@@ -2,10 +2,6 @@
The IVOA Provenance Data Model is adding metadata to trace the original process followed during the data production to provide astronomical data. Even if it borrows the main general concepts defined in the data management science, it binds to the specific context of astronomical metadata description and re-uses or interacts with existing IVOA models.
It benefits from existing IVOA notations and standards like UCD, VOUnits, VO protocols and service design and is planned for a full integration into the VO landscape.

-\TODO{Will be inserted later.}
-% Skipping this for now. Let's not let this draft look more official than it
-% currently is.
-
\begin{figure}
\centering
\includegraphics[width=0.9\textwidth]{VOArchitecture-Prov2016.png}

Modified: trunk/projects/dm/provenance/description/intro-general.tex
==============================================================================
--- trunk/projects/dm/provenance/description/intro-general.tex	Sat Oct 15 00:37:56 2016	(r3627)
+++ trunk/projects/dm/provenance/description/intro-general.tex	Sat Oct 15 00:38:49 2016	(r3628)
@@ -1,7 +1,7 @@
In this document, we discuss a draft for an IVOA standard data model for
describing the provenance of data. We focus on observational data, since
provenance for simulated data is already covered by the Simulation Data Model
-(SimDM \citep{std:SimDM}). However, the version currently discussed is
+\citep[SimDM][]{std:SimDM}. However, the version currently discussed is
sufficiently abstract, so that its core pattern could be applied to any kind
of process, including extraction of data from
databases or even the flow of scientific proposals from application to
@@ -23,7 +23,7 @@

In general, the model shall enable a scientist who has no prior knowledge about
a dataset to get more
-background information. This will help the scientist decide if the dataset
+background information. This will help the scientist to decide if the dataset
is adequate for his research goal, judge its quality and get enough information
to be able to trace back its history as far as possible.

Modified: trunk/projects/dm/provenance/description/intro-previousefforts.tex
==============================================================================
--- trunk/projects/dm/provenance/description/intro-previousefforts.tex	Sat Oct 15 00:37:56 2016	(r3627)
+++ trunk/projects/dm/provenance/description/intro-previousefforts.tex	Sat Oct 15 00:38:49 2016	(r3628)
@@ -1,5 +1,5 @@
\subsection{Previous efforts}
-The provenance concept was early introduced by the IVOA within the scope of the Observation Data Model (ref1 : IVOA note 2005) as a class  describing where the data are coming from. A full observation data model dedicated to the specific spectral data was then designed (Ref2 : spectral data model) as well as a fully generic characterisation data model of the measureemnt axes of the data (ref3: characterisation data model) while the progress on the provenance data model were slowing down.
+The provenance concept was introduced early by the IVOA within the scope of the Observation Data Model (ref1 : IVOA note 2005) as a class describing where the data are coming from. A full observation data model dedicated to the specific spectral data was then designed (Ref2 : spectral data model) as well as a fully generic characterisation data model of the measurement axes of the data (ref3: characterisation data model) while the progress on the provenance data model was slowing down.

The IVOA DM WG first gathered various use cases coming from different communities of observational astronomy (optical, radio, X-ray, interferometry). Common motivations for a provenance tracing of the history included: quality assessment, discovery of dataset progenitors and access to metadata necessary for reprocessing. The Provenance data model was then designed as the combination of Data processing, Observing Configuration and Observation ambient conditions data model classes.
The Processing class was embedding a sequence of processing stages which were hooking specific ad hoc details and links to input and output datasets, as well as processing step description.
@@ -11,6 +11,6 @@
The W3C model was taken up by a larger number of applications and tools than OPM, we are therefore basing our modeling efforts on the W3C Provenance data model, making it less abstract and more specific, or extending it where necessary.

-The W3C model even already specifies PROV-DM Extensibility points (section 6 in \cite{std:W3CProvDM}) for extending the core model. This allows to specify additional roles and types to each entity, agent or relation using the attributes \texttt{prov:type} and \texttt{prov:role}.
-By specifying the allowed values for the IVOA model, we could adjust the model to our needs while still being compliant to W3C.
+The W3C model even already specifies PROV-DM Extensibility points (section 6 in \cite{std:W3CProvDM}) for extending the core model. This allows one to specify additional roles and types to each entity, agent or relation using the attributes \texttt{prov:type} and \texttt{prov:role}.
+By specifying the allowed values for the IVOA model, we can adjust the model to our needs while still being compliant with W3C.

Modified: trunk/projects/dm/provenance/description/prov-refs.bib
==============================================================================
--- trunk/projects/dm/provenance/description/prov-refs.bib	Sat Oct 15 00:37:56 2016	(r3627)
+++ trunk/projects/dm/provenance/description/prov-refs.bib	Sat Oct 15 00:38:49 2016	(r3628)
@@ -1,12 +1,21 @@
-@misc{std:SimDM,
-    author = {Gerard Lemson and Laurent Bourg{\`e}s and Miguel Cervi{\~n}o and Claudio Gheller and Norman Gray and Franck LePetit and Mireille Louys and Benjamin Ooghe and Rick Wagner and Herv{\'e} Wozniak},
-    title = {Simulation Data Model, Version 1.0},
-    howpublished = {{IVOA Recommendation}},
-    month =        may,
-    year =         2012,
-    url =          {http://www.ivoa.net/documents/SimDM/}
+@MISC{std:SimDM,
+   author = {{Lemson}, G. and {Wozniak}, H. and {Bourges}, L. and {Cervino}, M. and
+    {Gheller}, C. and {Gray}, N. and {LePetit}, F. and {Louys}, M. and
+    {Ooghe}, B. and {Wagner}, R.},
+    title = "{Simulation Data Model Version 1.0}",
+howpublished = {IVOA Recommendation 03 May 2012},
+     year = 2012,
+    month = may,
+archivePrefix = "arXiv",
+   eprint = {1402.4744},
+ primaryClass = "astro-ph.IM",
+   editor = {{Lemson}, G. and {Wozniak}, H.},
+  adsnote = {Provided by the SAO/NASA Astrophysics Data System}
}

+
+
@misc{lemson08,
author = {Gerard Lemson and Herv{\'e} Wozniak and Rick Wagner and Claudio Gheller and Laurent Bourg{\`e}s},
title = {Proposal for a Simulation Database Standard, Version 1.00},
@@ -53,11 +62,3 @@
url = {http://www.ivoa.net/documents/latest/RM.html}
}

-@misc{std:ObsCore,
-    author = {Mireille Louys and Doug Tody and Patrick Dowler and Daniel Durand and Laurent Michel and Francois Bonnarel and Alberto Micol and the IVOA DataModel working group},
-    title = {Observation Data Model Core Components and its Implementation in the Table Access Protocol},
-    howpublished = {{IVOA} Working draft},
-    month =        mar,
-    year =         2016,
-    url =          {http://www.ivoa.net/documents/ObsCore/}
-}

Modified: trunk/projects/dm/provenance/description/usecases-implementations.tex
==============================================================================
--- trunk/projects/dm/provenance/description/usecases-implementations.tex	Sat Oct 15 00:37:56 2016	(r3627)
+++ trunk/projects/dm/provenance/description/usecases-implementations.tex	Sat Oct 15 00:38:49 2016	(r3628)
@@ -55,7 +55,7 @@
\centering
\includegraphics[width=0.9\textwidth]{usecase_Pollux_example1.png}
\caption{Pollux Example 1}
-\label{fig:archdiag}
+\label{fig:pollux}
\end{figure}

\subsection{HIPS use case}