2017-09-30 04:44:59 +00:00
#
# S. Van Hoey
#
# Build script for tdwg dwc handling
#
import io
2017-12-06 19:20:31 +00:00
import os
2017-10-01 14:49:12 +00:00
import re
2017-09-30 04:44:59 +00:00
import csv
2017-09-30 15:13:14 +00:00
import sys
2017-10-01 01:55:07 +00:00
import codecs
2017-09-30 04:44:59 +00:00
from urllib import request
2017-12-06 19:20:31 +00:00
from jinja2 import FileSystemLoader , Environment
2017-09-30 14:49:25 +00:00
2017-10-01 14:49:12 +00:00
# Supported namespace IRIs mapped to the abbreviation used in the generated
# output. Keys must end with a slash: lookups are done with the namespace part
# returned by DwcDigester.split_iri, which keeps the trailing slash.
NAMESPACES = {
    'http://rs.tdwg.org/dwc/iri/': 'dwciri',
    'http://rs.tdwg.org/dwc/terms/': 'dwc',
    'http://purl.org/dc/elements/1.1/': 'dc',
    'http://purl.org/dc/terms/': 'dcterms',
    'http://rs.tdwg.org/dwc/terms/attributes/': 'tdwgutility',
}
2017-09-30 04:44:59 +00:00
class ProvidedTermsError(Exception):
    """Raised when the term versions file and the terms config file do not
    list the same set of terms."""
2017-10-01 14:08:14 +00:00
2017-09-30 14:48:27 +00:00
class RdfTypeError(Exception):
    """Raised when an rdf_type is encountered that is not known by the
    builder."""
2017-10-01 14:49:12 +00:00
class DwcNamespaceError(Exception):
    """Raised when a namespace link is not available in the currently
    provided namespace links."""
2017-09-30 04:44:59 +00:00
class DwcBuildReader:
    """Context manager that opens a build input as a binary stream.

    The input is fetched with ``urllib.request.urlopen`` when its name
    contains ``https://raw.github``; otherwise it is opened as a local file
    in binary mode. The opened stream is closed when the ``with`` block exits.
    """

    def __init__(self, dwc_build_file):
        """Custom Reader switching between raw Github or local file

        Parameters
        -----------
        dwc_build_file : str
            either a path to a local file or a URL link to the raw Github
            version of the file
        """
        self.dwc_build_file = dwc_build_file
        # populated by __enter__; kept on the instance so __exit__ can close it
        self.open_dwc_term = None

    def __enter__(self):
        """Open the source and return the binary stream."""
        if "https://raw.github" in self.dwc_build_file:
            self.open_dwc_term = request.urlopen(self.dwc_build_file)
        else:
            self.open_dwc_term = open(self.dwc_build_file, 'rb')
        return self.open_dwc_term

    def __exit__(self, *args):
        """Close the stream opened by ``__enter__``; exceptions propagate."""
        self.open_dwc_term.close()
class DwcDigester(object):
    """Combine the normative term versions file and the terms config file
    into the data needed to generate the derived documents (html quick
    reference and simple Darwin Core CSV files)."""

    def __init__(self, term_versions, terms_config):
        """digest the normative document of Darwin Core and the
        configurations file to support automatic generation of derivatives

        Parameters
        -----------
        term_versions : str
            either a relative path and filename of the normative Dwc document
            or a URL link to the raw Github version of the file
        terms_config : str
            either a relative path and filename of the configurations file or
            a URL link to the raw Github version of the file

        Raises
        -------
        ProvidedTermsError
            when both files do not provide the same set of terms

        Notes
        -----
        Remark that the sequence of the configurations file entries is
        essential for the automatic generation of the individual documents
        (mainly the index.html)
        """
        self.term_versions = term_versions
        self.terms_config = terms_config

        self.term_versions_data = {}
        self._store_versions()
        self.terms_config_data = {}
        self._store_configs()

        # check for the ability to combine the data
        self.match_error_report()

        # create the defined data-object for the different outputs
        self.template_data = self.process_terms()

    @staticmethod
    def _csv_rows(source):
        """iterator providing the rows of a comma separated source (local
        file or raw Github URL) as dictionaries"""
        with DwcBuildReader(source) as raw:
            # decode explicitly as UTF-8 instead of the platform default, and
            # let the csv module handle newlines itself (newline="")
            text = io.TextIOWrapper(raw, encoding="utf-8", newline="")
            yield from csv.DictReader(text, delimiter=',')

    def versions(self):
        """iterator providing the terms as represented in the normative term
        versions file (only those with status `recommended`)"""
        for vterm in self._csv_rows(self.term_versions):
            if vterm["status"] == "recommended":
                yield vterm

    def configs(self):
        """iterator providing the terms as represented in the terms config
        file (taking into account the sequence)"""
        yield from self._csv_rows(self.terms_config)

    def _store_versions(self):
        """collect all the versions data in a dictionary as the
        term_versions_data attribute, keyed by term_iri"""
        for term in self.versions():
            self.term_versions_data[term["term_iri"]] = term

    def _store_configs(self):
        """collect all the config data in a dictionary as the
        terms_config_data attribute, keyed by term_iri"""
        for term in self.configs():
            self.terms_config_data[term["term_iri"]] = term

    @property
    def _version_terms(self):
        """get an overview of the terms in the term_versions file"""
        return set(self.term_versions_data)

    @property
    def _config_terms(self):
        """get an overview of the terms in the terms config file"""
        return set(self.terms_config_data)

    def _select_versions_term(self, term_iri):
        """select a specific term of the versions data, using term_iri match"""
        return self.term_versions_data[term_iri]

    def _select_config_term(self, term_iri):
        """select a specific term of the config data, using term_iri match"""
        return self.terms_config_data[term_iri]

    def match_error_report(self):
        """check if the prime dwc file and the configurations file provide
        corresponding terms and inform user on the term differences in
        between both files

        Raises
        -------
        ProvidedTermsError
            when at least one term is present in only one of both files; the
            message lists the local names of the terms concerned
        """
        overload_versionterms = self._version_terms - self._config_terms
        overload_configterms = self._config_terms - self._version_terms
        if overload_versionterms or overload_configterms:
            # sorted, so the message does not depend on set iteration order
            vs_terms = ", ".join(sorted(term.split("/")[-1]
                                        for term in overload_versionterms))
            cf_terms = ", ".join(sorted(term.split("/")[-1]
                                        for term in overload_configterms))
            raise ProvidedTermsError(
                "".join(["Terms only in term_versions.csv: ", vs_terms,
                         ". Terms only in terms_config.csv: ", cf_terms]))

    @staticmethod
    def split_iri(term_iri):
        """split an iri field into the namespace url and the local name of
        the term

        The namespace is everything up to and including the last slash, the
        local name is what follows. An iri without any slash raises an
        IndexError.
        """
        prog = re.compile(r"(.*/)([^/]*$)")
        namespace, local_name = prog.findall(term_iri)[0]
        return namespace, local_name

    @staticmethod
    def resolve_namespace_abbrev(namespace):
        """Using the NAMESPACES constant, get the namespace abbreviation by
        providing the namespace link

        Raises
        -------
        DwcNamespaceError
            when the namespace is not a key of NAMESPACES
        """
        if namespace not in NAMESPACES:
            raise DwcNamespaceError("The namespace url is currently not supported in NAMESPACES")
        return NAMESPACES[namespace]

    def get_term_definition(self, term_iri):
        """Extract the required information from both tables to show on the
        webpage of a single term by using the term_iri as the identifier

        Returns
        -------
        dict with the keys label, iri, class, definition, comments, rdf_type
        and namespace

        Notes
        ------
        Due to the current implementation, make sure to provide the same keys
        represented in the record-level specific version `process_terms`
        method (room for improvement)
        """
        cf_term = self._select_config_term(term_iri)
        vs_term = self._select_versions_term(term_iri)

        term_data = {}
        term_data["label"] = vs_term['label']
        term_data["iri"] = term_iri
        term_data["class"] = cf_term['organized_in']
        term_data["definition"] = self.convert_link(vs_term['definition'])
        # back-quoted sections are converted first, links afterwards
        term_data["comments"] = self.convert_link(self.convert_code(cf_term['comments']))
        term_data["rdf_type"] = vs_term['rdf_type']
        namespace_url, _ = self.split_iri(term_iri)
        term_data["namespace"] = self.resolve_namespace_abbrev(namespace_url)
        return term_data

    @staticmethod
    def convert_code(text_with_backticks):
        """takes all back-quoted sections in a text field and converts it to
        the html tagged version of code blocks <code>...</code>
        """
        return re.sub(r'`([^`]*)`', r'<code>\1</code>', text_with_backticks)

    @staticmethod
    def convert_link(text_with_urls):
        """takes all links in a text field and converts it to the html tagged
        version of the link
        """
        def _handle_matched(inputstring):
            """quick hack version of url handling on the current prime
            versions data"""
            url = inputstring.group()
            return "<a href=\"{}\">{}</a>".format(url, url)

        # raw string, so the regex escapes are not (invalid) string escapes;
        # the trailing look-behind keeps a closing `)` or `.` out of the link
        regx = r"(http[s]?://[\w\d:#@%/;$()~_?\+-;=\\.&]*)(?<![\)\.])"
        return re.sub(regx, _handle_matched, text_with_urls)

    def process_terms(self):
        """parse the config terms (sequence matters!), collect all required
        data from both the normative versions file and the config file and
        return the template ready data.

        Returns
        -------
        Data object that can be digested by the html-template file. Contains
        the term data formatted to create the individual outputs, each list
        element is a dictionary representing a class group. Each class group
        and each term carries the keys provided by `get_term_definition`.
        Hence, the data object is structured as follows:

        [
            {'label': class_group_label_1, 'iri': xxxx, ...,
             'terms':
                [
                    {'label': term_1, 'iri': xxxx, ...},
                    {'label': term_2, 'iri': xxxx, ...},
                    ...
                ]},
            {'label': class_group_label_2, ...
             ...},
            ...
        ]
        """
        template_data = []

        # sequence matters in config and it starts with Record-level which we
        # populate here ad-hoc
        class_group = {}
        class_group["label"] = "Record-level"
        class_group["iri"] = None
        class_group["class"] = None
        class_group["definition"] = None
        class_group["comments"] = None
        class_group["rdf_type"] = None
        class_group["terms"] = []
        class_group["namespace"] = None

        for term in self.configs():  # sequence of the config file used as order
            term_data = self.get_term_definition(term['term_iri'])
            # new class encountered
            if term_data["rdf_type"] == "http://www.w3.org/2000/01/rdf-schema#Class":
                # store previous section in template_data
                template_data.append(class_group)
                # start new class group
                class_group = term_data
                class_group["terms"] = []
            else:
                class_group['terms'].append(term_data)
        # save the last class to template_data
        template_data.append(class_group)
        return template_data

    def create_html(self, html_template="./config/index.tmpl",
                    html_output="../www/guides/index.html"):
        """build html with the processed term info, by filling in the
        tmpl-template

        Parameters
        -----------
        html_template : str
            relative path and filename to the Jinja2 compatible
            template
        html_output : str
            relative path and filename to write the resulting index.html
        """
        data = {}
        data["class_groups"] = self.template_data

        env = Environment(loader=FileSystemLoader(os.path.dirname(html_template)))
        template = env.get_template(os.path.basename(html_template))
        html = template.render(data)

        # context manager closes the file also when writing fails; explicit
        # UTF-8 so the output does not depend on the platform default
        with open(html_output, "w", encoding="utf-8") as index_page:
            index_page.write(str(html))

    def simple_dwc_terms(self):
        """only extract those terms that are simple dwc, defined as `simple`
        in the flags column of the config file of terms

        Returns
        -------
        list with the labels of the simple dwc properties, in the sequence of
        the config file
        """
        properties = []
        for term in self.configs():
            term_data = self.get_term_definition(term['term_iri'])
            if (term_data["rdf_type"] == "http://www.w3.org/1999/02/22-rdf-syntax-ns#Property" and
                    term["flags"] == "simple"):
                properties.append(term_data["label"])
        return properties

    def create_dwc_list(self, file_output="../dist/simple_dwc_vertical.csv"):
        """build a list of simple dwc terms and write it to file

        Parameters
        -----------
        file_output : str
            relative path and filename to write the resulting list
        """
        # collect the terms before opening (and truncating) the output file
        properties = self.simple_dwc_terms()
        with codecs.open(file_output, 'w', 'utf-8') as dwc_list_file:
            for term in properties:
                dwc_list_file.write(term + "\n")

    def create_dwc_header(self, file_output="../dist/simple_dwc_horizontal.csv"):
        """build a header of simple dwc terms and write it to file

        Parameters
        -----------
        file_output : str
            relative path and filename to write the resulting list
        """
        # collect the terms before opening (and truncating) the output file
        properties = self.simple_dwc_terms()
        with codecs.open(file_output, 'w', 'utf-8') as dwc_header_file:
            dwc_header_file.write(",".join(properties))
            dwc_header_file.write("\n")
2017-09-30 15:13:14 +00:00
def main():
    """Building up the quick reference html and derivatives"""

    # both paths are relative to the directory the script is run from
    config_terms_file = "./config/terms.csv"
    term_versions_file = "../standard/vocabularies/term_versions.csv"

    print("Running build process:")
    my_dwc = DwcDigester(term_versions_file, config_terms_file)
    print("Building quick reference guide")
    my_dwc.create_html()
    print("Building simple DwC CSV files")
    my_dwc.create_dwc_list()
    my_dwc.create_dwc_header()
    print("Done!")
2017-09-30 15:13:14 +00:00
# run the build only when executed as a script, not when imported
if __name__ == "__main__":
    sys.exit(main())