# dwc/build/build.py

#
# S. Van Hoey
#
# Build script for tdwg dwc handling
#
import codecs
import csv
import io
import re
import sys
from urllib import request

from Cheetah.Template import Template
# Supported namespace IRIs, each mapped to the abbreviation used for it.
# DwcDigester.resolve_namespace_abbrev() looks namespaces up in this table.
NAMESPACES = {
    'http://rs.tdwg.org/dwc/iri/': 'dwciri',
    'http://rs.tdwg.org/dwc/terms/': 'dwc',
    'http://purl.org/dc/elements/1.1/': 'dc',
    'http://purl.org/dc/terms/': 'dcterms',
    'http://rs.tdwg.org/dwc/terms/attributes/': 'tdwgutility',
}
class ProvidedTermsError(Exception):
    """Raised when the available term files are inconsistent with each other."""
class RdfTypeError(Exception):
    """Raised when an rdf type is encountered that is not known by the builder."""
class DwcNamespaceError(Exception):
    """Raised when a namespace link is not available in the provided links."""
class DwcBuildReader():
    """Context manager giving binary read access to a build input file.

    The input is fetched with ``urllib.request.urlopen`` when its name
    contains ``"https://raw.github"``; otherwise it is opened as a local file
    in binary mode. The opened stream is closed when the ``with`` block exits.
    """

    def __init__(self, dwc_build_file):
        """Custom reader switching between a raw Github URL and a local file

        Parameters
        -----------
        dwc_build_file : str
            either a path to a local file or a URL link to the raw Github
            version of the file
        """
        self.dwc_build_file = dwc_build_file
        # set by __enter__; defined here so the attribute always exists
        self.open_dwc_term = None

    def __enter__(self):
        """Open the configured source and return the binary stream"""
        if "https://raw.github" in self.dwc_build_file:
            self.open_dwc_term = request.urlopen(self.dwc_build_file)
        else:
            self.open_dwc_term = open(self.dwc_build_file, 'rb')
        return self.open_dwc_term

    def __exit__(self, *args):
        """Close the stream opened by __enter__ (exceptions are not suppressed)"""
        self.open_dwc_term.close()
class DwcDigester(object):
    """Combine the Darwin Core term versions file with the terms config file.

    On construction both files are read, checked for matching terms and
    converted into ``template_data``, from which the HTML page and the simple
    Darwin Core CSV derivatives are generated.
    """

    def __init__(self, term_versions, terms_config):
        """digest the normative document of Darwin Core and the configurations
        file to support automatic generation of derivatives

        Parameters
        -----------
        term_versions : str
            either a relative path and filename of the normative Dwc document
            or a URL link to the raw Github version of the file
        terms_config : str
            either a relative path and filename of the configurations file or
            a URL link to the raw Github version of the file

        Raises
        ------
        ProvidedTermsError
            when both files do not provide the same set of terms

        Notes
        -----
        Remark that the sequence of the configurations file entries is
        essential for the automatic generation of the individual documents
        (mainly the index.html)
        """
        self.term_versions = term_versions
        self.terms_config = terms_config

        self.term_versions_data = {}
        self._store_versions()
        self.terms_config_data = {}
        self._store_configs()

        # check for the ability to combine the data
        self.match_error_report()

        # create the defined data-object for the different outputs
        self.template_data = self.process_terms()

    def versions(self):
        """iterator providing the terms as represented in the normative term
        versions file

        Only rows whose ``status`` column equals ``recommended`` are yielded.
        """
        with DwcBuildReader(self.term_versions) as versions:
            # Decode explicitly instead of relying on the platform default;
            # assumes the input is UTF-8, like the CSV files this class writes.
            # newline="" lets the csv module handle line endings itself.
            text = io.TextIOWrapper(versions, encoding="utf-8", newline="")
            for vterm in csv.DictReader(text, delimiter=','):
                if vterm["status"] == "recommended":
                    yield vterm

    def configs(self):
        """iterator providing the terms as represented in the terms config file
        (taking into account the sequence)"""
        with DwcBuildReader(self.terms_config) as configs:
            # same decoding assumptions as in versions()
            text = io.TextIOWrapper(configs, encoding="utf-8", newline="")
            for cfterm in csv.DictReader(text, delimiter=','):
                yield cfterm

    def _store_versions(self):
        """collect all the versions data in a dictionary as the
        term_versions_data attribute (keyed by term_iri)"""
        for term in self.versions():
            self.term_versions_data[term["term_iri"]] = term

    def _store_configs(self):
        """collect all the config data in a dictionary as the
        terms_config_data attribute (keyed by term_iri)"""
        for term in self.configs():
            self.terms_config_data[term["term_iri"]] = term

    @property
    def _version_terms(self):
        """get an overview of the terms in the term_versions file"""
        return set(self.term_versions_data.keys())

    @property
    def _config_terms(self):
        """get an overview of the terms in the terms config file"""
        return set(self.terms_config_data.keys())

    def _select_versions_term(self, term_iri):
        """Select a specific term of the versions data, using term_iri match"""
        return self.term_versions_data[term_iri]

    def _select_config_term(self, term_iri):
        """Select a specific term of the config data, using term_iri match"""
        return self.terms_config_data[term_iri]

    def match_error_report(self):
        """check if the term versions file and the terms config file provide
        corresponding terms

        Raises
        ------
        ProvidedTermsError
            when a term is present in only one of both files; the message
            lists the offending term names
        """
        overload_versionterms = self._version_terms - self._config_terms
        overload_configterms = self._config_terms - self._version_terms
        if overload_versionterms or overload_configterms:
            # sorted, so the report is reproducible between runs
            vs_terms = ", ".join(sorted(term.split("/")[-1]
                                        for term in overload_versionterms))
            cf_terms = ", ".join(sorted(term.split("/")[-1]
                                        for term in overload_configterms))
            raise ProvidedTermsError("".join(["Terms only in term_versions.csv: ", vs_terms,
                                              ". Terms only in terms_config.csv: ", cf_terms]))

    @staticmethod
    def split_iri(term_iri):
        """Split a term IRI into its namespace and its term name

        The namespace is everything up to and including the last ``/``, the
        term name is the remainder.

        Raises
        ------
        IndexError
            when the IRI does not contain a ``/``
        """
        prog = re.compile(r"(.*/)([^/]*$)")
        namespace, term = prog.findall(term_iri)[0]
        return namespace, term

    @staticmethod
    def resolve_namespace_abbrev(namespace):
        """Using the NAMESPACES constant, get the namespace abbreviation

        Raises
        ------
        DwcNamespaceError
            when the namespace is not a key of NAMESPACES
        """
        if namespace not in NAMESPACES:
            raise DwcNamespaceError("The namespace url is currently not supported in NAMESPACES")
        return NAMESPACES[namespace]

    def get_term_definition(self, config_term):
        """Extract the required information to show on the webpage of a single term

        Parameters
        -----------
        config_term : str
            IRI of the term (the ``term_iri`` value of a config row); it is
            used to look the term up in both the config and the versions data

        Returns
        -------
        dict
            with the keys name, iri, label, class, definition, comments,
            rdf_type and namespace

        Raises
        ------
        KeyError
            when the IRI is unknown in the config or versions data
        DwcNamespaceError
            when the namespace of the IRI is not listed in NAMESPACES
        """
        term_iri = config_term
        cf_term = self._select_config_term(term_iri)
        vs_term = self._select_versions_term(term_iri)
        namespace_url, term_name = self.split_iri(term_iri)

        term_data = {}
        term_data["name"] = term_name
        term_data["iri"] = term_iri
        term_data["label"] = vs_term['label']
        term_data["class"] = cf_term['organized_in']
        term_data["definition"] = vs_term['definition']
        term_data["comments"] = cf_term['comments']
        term_data["rdf_type"] = vs_term['rdf_type']
        term_data["namespace"] = self.resolve_namespace_abbrev(namespace_url)
        return term_data

    def process_terms(self):
        """parse the config terms towards the structure required for the HTML template

        Returns
        -------
        list of dict
            one dictionary per class group, each holding the group
            information and, under the key ``terms``, the list of term
            dictionaries that follow the group in the config file
        """
        template_data = []

        # sequence matters in config and it starts with Record-level
        class_group = {
            "name": "Record-level",
            "iri": None,
            "label": "Record-level",
            "class": None,
            "definition": None,
            "comments": None,
            "rdf_type": None,
            "terms": [],
            "namespace": "Record-level",
        }

        for term in self.configs():  # sequence of the config file used as order
            term_data = self.get_term_definition(term['term_iri'])
            # new class encountered
            if term_data["rdf_type"] == "http://www.w3.org/2000/01/rdf-schema#Class":
                # store previous section in template_data
                template_data.append(class_group)
                # start new class group
                class_group = term_data
                class_group["terms"] = []
            else:
                class_group['terms'].append(term_data)

        # save the last class to template_data
        template_data.append(class_group)
        return template_data

    def create_html(self, html_template="./config/index.tmpl",
                    html_output="../guides/index.html"):
        """build html with the processed term info, by filling in the tmpl-template

        Parameters
        -----------
        html_template : str
            relative path and filename to the [Cheetah3](http://cheetahtemplate.org/)
            compatible template
        html_output : str
            relative path and filename to write the resulting index.html

        Notes
        -----
        The template receives ``self.template_data`` under the name ``groups``.
        Each list element is a dictionary representing a class group. Hence,
        the data object is structured as follows

            [
                {'name' : class_group_name_1, 'label': xxxx,...,
                 'terms':
                    [
                        {'name' : term_1, 'label': xxxx,...},
                        {'name' : term_2, 'label': xxxx,...},
                        ...
                    ]},
                {'name' : class_group_name_2,...
                 ...},
                ...
            ]
        """
        data = {}
        data["groups"] = self.template_data
        html = Template(file=html_template, searchList=[data])
        # NOTE(review): the page is written with the platform default encoding
        with open(html_output, "w") as index_page:
            index_page.write(str(html))

    def simple_dwc_terms(self):
        """only extract those terms that are simple dwc,
        defined as `simple` in the flags column of the config file of terms

        Returns
        -------
        list of str
            names of the rdf properties flagged as simple, in config file order
        """
        properties = []
        for term in self.configs():
            term_data = self.get_term_definition(term['term_iri'])
            if (term_data["rdf_type"] == "http://www.w3.org/1999/02/22-rdf-syntax-ns#Property" and
                    term["flags"] == "simple"):
                properties.append(term_data["name"])
        return properties

    def create_dwc_list(self, file_output="../dist/simple_dwc_vertical.csv"):
        """build a list of simple dwc terms, one term name on each line"""
        # newline="" keeps the written "\n" untranslated on every platform
        with open(file_output, 'w', encoding='utf-8', newline='') as dwc_list_file:
            for term in self.simple_dwc_terms():
                dwc_list_file.write(term + "\n")

    def create_dwc_header(self, file_output="../dist/simple_dwc_horizontal.csv"):
        """build a header of simple dwc terms, comma separated on a single line"""
        # newline="" keeps the written "\n" untranslated on every platform
        with open(file_output, 'w', encoding='utf-8', newline='') as dwc_header_file:
            properties = self.simple_dwc_terms()
            dwc_header_file.write(",".join(properties))
            dwc_header_file.write("\n")
def main():
    """Building up the quick reference html and derivatives

    Reads the local terms config and term versions files, then writes the
    index html page and the simple dwc list and header files to the default
    output locations of the DwcDigester methods.
    """
    config_terms_file = "./config/terms.csv"
    term_versions_file = "../vocabulary/term_versions.csv"

    print("Running build process using current term_versions and config_terms file...")
    my_dwc = DwcDigester(term_versions_file, config_terms_file)
    print("Building index html file...")
    my_dwc.create_html()
    print("Building simple dwc list and header...")
    my_dwc.create_dwc_list()
    my_dwc.create_dwc_header()
    print("...done!")
# Run the build only when executed as a script; main() returns None, which
# sys.exit() treats as a successful exit.
if __name__ == "__main__":
    sys.exit(main())