
284 lines
11 KiB
Raw Normal View History

# S. Van Hoey
# Build script for tdwg dwc handling
import io
import re
import csv
2017-09-30 15:13:14 +00:00
import sys
2017-10-01 01:55:07 +00:00
import codecs
from urllib import request
2017-09-30 14:49:25 +00:00
from Cheetah.Template import Template
# Supported namespace IRIs mapped to the abbreviation used for them in the
# generated documents; looked up by DwcDigester.resolve_namespace_abbrev.
NAMESPACES = {
    'http://rs.tdwg.org/dwc/iri/': 'dwciri',
    'http://rs.tdwg.org/dwc/terms/': 'dwc',
    'http://purl.org/dc/elements/1.1/': 'dc',
    'http://purl.org/dc/terms/': 'dcterms',
    'http://rs.tdwg.org/dwc/terms/attributes/': 'tdwgutility',
}
class ProvidedTermsError(Exception):
    """Inconsistency in the available terms.

    Raised when the term versions file and the terms config file do not
    list the same set of term IRIs.
    """


class RdfTypeError(Exception):
    """An rdf_type was encountered that is not known by the builder."""


class DwcNamespaceError(Exception):
    """Namespace link is not available in the currently provided links."""


class DwcBuildReader:
    """Context manager yielding a binary handle on a build input file.

    The input is fetched over HTTP when it points at a raw Github URL and
    is opened from the local file system otherwise. The handle is closed
    when the ``with`` block is left.
    """

    def __init__(self, dwc_build_file):
        """Custom Reader switching between to raw Github or local file

        Parameters
        ----------
        dwc_build_file : str
            either a local path or a URL containing ``https://raw.github``
        """
        self.dwc_build_file = dwc_build_file
        self.open_dwc_term = None

    def __enter__(self):
        """Open the source and return the binary file-like object."""
        if "https://raw.github" in self.dwc_build_file:
            self.open_dwc_term = request.urlopen(self.dwc_build_file)
        else:
            # without this branch the URL handle would be overwritten (and
            # leaked) by an attempt to open the URL as a local path
            self.open_dwc_term = open(self.dwc_build_file, 'rb')
        return self.open_dwc_term

    def __exit__(self, *args):
        """Close the handle; exceptions from the with-body are not suppressed."""
        if self.open_dwc_term is not None:
            self.open_dwc_term.close()


class DwcDigester(object):
    """Combine the Darwin Core term versions with the terms configuration.

    On construction both input files are read, checked against each other
    and converted to ``template_data``: a list of class groups, each group a
    dictionary with the keys produced by ``get_term_definition`` plus a
    ``terms`` list holding the term dictionaries that belong to the group::

        [{'name': class_group_name_1, 'label': xxxx, ...,
          'terms': [{'name': term_1, 'label': xxxx, ...},
                    {'name': term_2, 'label': xxxx, ...}]},
         {'name': class_group_name_2, ...}]
    """

    _RDFS_CLASS = "http://www.w3.org/2000/01/rdf-schema#Class"
    _RDF_PROPERTY = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Property"

    def __init__(self, term_versions, terms_config):
        """digest the normative document of Darwin Core and the configurations file to support automatic generation of derivatives

        Parameters
        ----------
        term_versions : str
            either a relative path and filename of the normative Dwc document or a URL link to the
            raw Github version of the file
        terms_config : str
            either a relative path and filename of the configurations file or a URL link to the
            raw Github version of the file

        Raises
        ------
        ProvidedTermsError
            when both files do not provide the same set of term IRIs

        Notes
        -----
        Remark that the sequence of the configurations file entries is essential for the automatic generation of the individual documents (mainly the index.html)
        """
        self.term_versions = term_versions
        self.terms_config = terms_config

        self.term_versions_data = {}
        self._store_versions()
        self.terms_config_data = {}
        self._store_configs()

        # check for the ability to combine the data
        self.match_error_report()

        # create the defined data-object for the different outputs
        self.template_data = self.process_terms()

    @staticmethod
    def _rows(build_file):
        """Yield the rows of a comma separated build file as dictionaries."""
        with DwcBuildReader(build_file) as handle:
            # decode explicitly instead of relying on the locale encoding;
            # newline="" leaves the line-ending handling to the csv module
            text = io.TextIOWrapper(handle, encoding="utf-8", newline="")
            for row in csv.DictReader(text, delimiter=','):
                yield row

    def versions(self):
        """iterator providing the terms as represented in the normative term versions file

        Only rows whose ``status`` column equals ``recommended`` are yielded.
        """
        for vterm in self._rows(self.term_versions):
            if vterm["status"] == "recommended":
                yield vterm

    def configs(self):
        """iterator providing the terms as represented in the terms config file
        (taking into account the sequence)"""
        for cfterm in self._rows(self.terms_config):
            yield cfterm

    def _store_versions(self):
        """collect all the versions data in a dictionary as the term_versions_data attribute"""
        for term in self.versions():
            self.term_versions_data[term["term_iri"]] = term

    def _store_configs(self):
        """collect all the config data in a dictionary as the terms_config_data attribute"""
        for term in self.configs():
            self.terms_config_data[term["term_iri"]] = term

    @property
    def _version_terms(self):
        """get an overview of the terms in the term_versions file"""
        return set(self.term_versions_data)

    @property
    def _config_terms(self):
        """get an overview of the terms in the terms config file"""
        return set(self.terms_config_data)

    def _select_versions_term(self, term_iri):
        """Select a specific term of the versions data, using term_iri match"""
        return self.term_versions_data[term_iri]

    def _select_config_term(self, term_iri):
        """Select a specific term of the config data, using term_iri match"""
        return self.terms_config_data[term_iri]

    def match_error_report(self):
        """check if the prime dwc file and the humaninfo file provide corresponding terms

        Raises
        ------
        ProvidedTermsError
            when a term IRI is present in only one of both files; the message
            lists the local names of the offending terms
        """
        overload_versionterms = self._version_terms - self._config_terms
        overload_configterms = self._config_terms - self._version_terms
        if overload_versionterms or overload_configterms:
            # sorted, so the message does not depend on set iteration order
            vs_terms = ", ".join(sorted(term.split("/")[-1]
                                        for term in overload_versionterms))
            cf_terms = ", ".join(sorted(term.split("/")[-1]
                                        for term in overload_configterms))
            raise ProvidedTermsError("".join(["Terms only in term_versions.csv: ", vs_terms,
                                              ". Terms only in terms_config.csv: ", cf_terms]))

    @staticmethod
    def split_iri(term_iri):
        """Split a term IRI on its last slash.

        Returns
        -------
        tuple of str
            the namespace (including the trailing slash) and the term name

        Raises
        ------
        IndexError
            when the IRI does not contain a slash
        """
        prog = re.compile(r"(.*/)([^/]*$)")
        namespace, term = prog.findall(term_iri)[0]
        return namespace, term

    @staticmethod
    def resolve_namespace_abbrev(namespace):
        """Using the NAMESPACE constant, get the namespace abbreviation

        Raises
        ------
        DwcNamespaceError
            when the namespace is not a key of NAMESPACES
        """
        if namespace not in NAMESPACES:
            raise DwcNamespaceError("The namespace url is currently not supported in NAMESPACES")
        return NAMESPACES[namespace]

    def get_term_definition(self, config_term):
        """Extract the required information to show on the webpage of a single term

        Parameters
        ----------
        config_term : str or mapping
            either the term IRI itself (as passed by the callers in this
            class) or a configuration row providing a ``term_iri`` entry

        Returns
        -------
        dict
            with the keys name, iri, label, class, definition, comments,
            rdf_type and namespace
        """
        if isinstance(config_term, str):
            term_iri = config_term
        else:
            term_iri = config_term["term_iri"]

        cf_term = self._select_config_term(term_iri)
        vs_term = self._select_versions_term(term_iri)
        namespace_url, name = self.split_iri(term_iri)

        term_data = {}
        term_data["name"] = name
        term_data["iri"] = term_iri
        term_data["label"] = vs_term['label']
        term_data["class"] = cf_term['organized_in']
        term_data["definition"] = vs_term['definition']
        term_data["comments"] = cf_term['comments']
        term_data["rdf_type"] = vs_term['rdf_type']
        term_data["namespace"] = self.resolve_namespace_abbrev(namespace_url)
        return term_data

    def process_terms(self):
        """parse the config terms towards the structure required for the HTML template

        Returns
        -------
        list of dict
            one dictionary per class group, in the order of the config file,
            each with a ``terms`` list of the terms following that class
        """
        template_data = []

        # sequence matters in config and it starts with Record-level
        class_group = {
            "name": "Record-level",
            "iri": None,
            "label": "Record-level",
            "class": None,
            "definition": None,
            "comments": None,
            "rdf_type": None,
            "terms": [],
            "namespace": "Record-level",
        }

        # NOTE(review): the source also tracked the label of the current class
        # ("check on the class working in"); whatever check consumed it is not
        # recoverable, so terms are grouped purely on their file order.
        for term in self.configs():  # sequence of the config file used as order
            term_data = self.get_term_definition(term['term_iri'])
            if term_data["rdf_type"] == self._RDFS_CLASS:
                # new class encountered: store previous section in
                # template_data and start a new class group
                template_data.append(class_group)
                class_group = term_data
                class_group["terms"] = []
            else:
                class_group["terms"].append(term_data)

        # save the last class to template_data
        template_data.append(class_group)
        return template_data

    def create_html(self, html_template="./config/index.tmpl",
                    html_output="../dist/index.html"):
        """build html with the processed term info, by filling in the tmpl-template

        The template receives ``self.template_data`` under the name ``groups``.

        Parameters
        ----------
        html_template : str
            relative path and filename to the [Cheetah3](http://cheetahtemplate.org/) compatible
            template
        html_output : str
            relative path and filename to write the resulting index.html.
            NOTE(review): the original default value is not recoverable; the
            current default places the page next to the other generated
            files - TODO confirm.
        """
        data = {}
        data["groups"] = self.template_data
        html = Template(file=html_template, searchList=[data])

        # the with-block guarantees the page is flushed and closed
        with open(html_output, "w", encoding="utf-8") as index_page:
            index_page.write(str(html))

    def simple_dwc_terms(self):
        """only extract those terms that are simple dwc,
        defined as `simple` in the flags column of the config file of terms

        Returns
        -------
        list of str
            the term names, in the order of the config file
        """
        properties = []
        for term in self.configs():
            term_data = self.get_term_definition(term['term_iri'])
            if (term_data["rdf_type"] == self._RDF_PROPERTY and
                    term["flags"] == "simple"):
                properties.append(term_data["name"])
        return properties

    def create_dwc_list(self, file_output="../dist/simple_dwc_vertical.csv"):
        """build a list of simple dwc terms (one term per line)"""
        with codecs.open(file_output, 'w', 'utf-8') as dwc_list_file:
            for term in self.simple_dwc_terms():
                dwc_list_file.write(term + "\n")

    def create_dwc_header(self, file_output="../dist/simple_dwc_horizontal.csv"):
        """build a header of simple dwc terms (all terms on one comma separated line)"""
        with codecs.open(file_output, 'w', 'utf-8') as dwc_header_file:
            properties = self.simple_dwc_terms()
            dwc_header_file.write(",".join(properties) + "\n")


2017-09-30 15:13:14 +00:00
def main():
    """Building up the quick reference html and derivatives"""
    config_terms_file = "./config/terms.csv"
    term_versions_file = "../vocabulary/term_versions.csv"

    print("Running build process using current term_versions and config_terms file...")
    my_dwc = DwcDigester(term_versions_file, config_terms_file)
    print("Building index html file...")
    my_dwc.create_html()
    print("Building simple dwc list and header...")
    my_dwc.create_dwc_list()
    my_dwc.create_dwc_header()


if __name__ == "__main__":
    sys.exit(main())