#
# S. Van Hoey
#
# Build script for tdwg dwc handling
#
import io
import re
import csv
import sys
import codecs
from urllib import request
from Cheetah.Template import Template
# Supported namespace URLs, each mapped to the abbreviation used for it.
NAMESPACES = {
    'http://rs.tdwg.org/dwc/iri/': 'dwciri',
    'http://rs.tdwg.org/dwc/terms/': 'dwc',
    'http://purl.org/dc/elements/1.1/': 'dc',
    'http://purl.org/dc/terms/': 'dcterms',
    'http://rs.tdwg.org/dwc/terms/attributes/': 'tdwgutility',
}
class ProvidedTermsError(Exception):
    """Raised when the sets of available terms are inconsistent."""
class RdfTypeError(Exception):
    """Raised for an rdf type that the builder does not know."""
class DwcNamespaceError(Exception):
    """Raised when a namespace link is missing from the provided links."""
class DwcBuildReader:
    """Context manager handing out a binary stream of a build input.

    A location containing ``https://raw.github`` is fetched with
    ``urllib.request.urlopen``; anything else is opened as a local file
    in binary mode. The stream is closed again when the ``with`` block
    is left.
    """

    def __init__(self, dwc_build_file):
        """Remember the location (raw Github URL or local path) to read."""
        self.dwc_build_file = dwc_build_file

    def __enter__(self):
        location = self.dwc_build_file
        self.open_dwc_term = (
            request.urlopen(location)
            if "https://raw.github" in location
            else open(location, 'rb')
        )
        return self.open_dwc_term

    def __exit__(self, *exc_info):
        # exception details are ignored; the stream is always closed
        self.open_dwc_term.close()
class DwcDigester(object):
def __init__(self, term_versions, terms_config):
    """Digest the normative Darwin Core document and the configurations
    file to support automatic generation of derivatives.

    Parameters
    -----------
    term_versions : str
        either a relative path and filename of the normative Dwc
        document or a URL link to the raw Github version of the file
    terms_config : str
        either a relative path and filename of the configurations file
        or a URL link to the raw Github version of the file

    Notes
    -----
    The sequence of the entries in the configurations file is essential
    for the automatic generation of the individual documents (mainly
    the index.html).
    """
    self.term_versions = term_versions
    self.terms_config = terms_config

    # both lookup tables start out empty and are filled from their file
    self.term_versions_data = {}
    self.terms_config_data = {}
    self._store_versions()
    self._store_configs()

    # make sure both files describe the same terms before combining them
    self.match_error_report()

    # data object shared by the different outputs
    self.template_data = self.process_terms()
def versions(self):
    """Iterate over the recommended terms of the term versions file.

    The location stored in ``self.term_versions`` is opened through
    ``DwcBuildReader`` and parsed as comma-delimited CSV. Only rows
    whose ``status`` column equals ``"recommended"`` are provided, in
    the order in which they appear in the file.

    Yields
    ------
    mapping
        one CSV row, mapping the column names to the row values
    """
    with DwcBuildReader(self.term_versions) as versions:
        # Decode explicitly: without an encoding TextIOWrapper uses the
        # locale's preferred encoding, so the parsed text would differ
        # between machines.
        # NOTE(review): assumes the source file is UTF-8 -- confirm.
        text_stream = io.TextIOWrapper(versions, encoding="utf-8")
        for vterm in csv.DictReader(text_stream, delimiter=','):
            if vterm["status"] == "recommended":
                yield vterm
def configs(self):
    """Iterate over the terms of the terms config file.

    The location stored in ``self.terms_config`` is opened through
    ``DwcBuildReader`` and parsed as comma-delimited CSV. Every row is
    provided, in the order in which it appears in the file.

    Yields
    ------
    mapping
        one CSV row, mapping the column names to the row values
    """
    with DwcBuildReader(self.terms_config) as configs:
        # Decode explicitly: without an encoding TextIOWrapper uses the
        # locale's preferred encoding, so the parsed text would differ
        # between machines.
        # NOTE(review): assumes the source file is UTF-8 -- confirm.
        text_stream = io.TextIOWrapper(configs, encoding="utf-8")
        for cfterm in csv.DictReader(text_stream, delimiter=','):
            yield cfterm
def _store_versions(self):
"""collect all the versions data in a dictionary as the term_versions_data attribute"""
for term in self.versions():
self.term_versions_data[term["term_iri"]] = term
def _store_configs(self):
"""collect all the config data in a dictionary as the terms_config_data attribute"""
for term in self.configs():
self.terms_config_data[term["term_iri"]] = term
@property
def _version_terms(self):
"""get an overview of the terms in the term_versions file"""
return set(self.term_versions_data.keys())
@property
def _config_terms(self):
"""get an overview of the terms in the terms config file"""
return set(self.terms_config_data.keys())
def _select_versions_term(self, term_iri):
"""select a specific term of the versions data, using term_iri match"""
return self.term_versions_data[term_iri]
def _select_config_term(self, term_iri):
"""select a specific term of the config data, using term_iri match"""
return self.terms_config_data[term_iri]
def match_error_report(self):
"""check if the prime dwc file and the configurations file provide corresponding terms and inform user on the term differences in between both files"""
overload_versionterms = self._version_terms - self._config_terms
overload_configterms = self._config_terms - self._version_terms
if len(overload_versionterms) > 0 or len(overload_configterms) > 0:
vs_terms = ", ".join([term.split("/")[-1] for term in overload_versionterms])
cf_terms = ", ".join([term.split("/")[-1] for term in overload_configterms])
raise ProvidedTermsError("".join(["Terms only in term_versions.csv: ", vs_terms,
". Terms only in terms_config.csv: ", cf_terms]))
@staticmethod
def split_iri(term_iri):
"""split an iri field into the namespace url and the local name of the term"""
prog = re.compile("(.*/)([^/]*$)")
namespace, local_name = prog.findall(term_iri)[0]
return namespace, local_name
@staticmethod
def resolve_namespace_abbrev(namespace):
    """Return the abbreviation registered in NAMESPACES for a namespace
    link.

    Raises
    ------
    DwcNamespaceError
        when `namespace` is not one of the NAMESPACES keys
    """
    if namespace in NAMESPACES.keys():
        return NAMESPACES[namespace]
    raise DwcNamespaceError("The namespace url is currently not supported in NAMESPACES")
def get_term_definition(self, term_iri):
    """Collect from both tables the information shown on the webpage of
    a single term, identified by its term_iri.

    Returns
    -------
    dict
        with the keys label, iri, class, definition, comments, rdf_type
        and namespace

    Notes
    ------
    Due to the current implementation, make sure to provide the same
    keys as the record-level specific version in the `process_terms`
    method (room for improvement).
    """
    config_entry = self._select_config_term(term_iri)
    versions_entry = self._select_versions_term(term_iri)
    return {
        "label": versions_entry['label'],
        "iri": term_iri,
        "class": config_entry['organized_in'],
        "definition": self.convert_link(versions_entry['definition']),
        # back-quoted sections are converted before the links are
        "comments": self.convert_link(self.convert_code(config_entry['comments'])),
        "rdf_type": versions_entry['rdf_type'],
        "namespace": self.resolve_namespace_abbrev(self.split_iri(term_iri)[0]),
    }
@staticmethod
def convert_code(text_with_backticks):
"""takes all back-quoted sections in a text field and converts it to the html tagged version of code blocks ...
"""
return re.sub(r'`([^`]*)`', r'\1
', text_with_backticks)
@staticmethod
def convert_link(text_with_urls):
"""takes all links in a text field and converts it to the html tagged version of the link
"""
def _handle_matched(inputstring):
"""quick hack version of url handling on the current prime versions data"""
url = inputstring.group()
return "{}".format(url, url)
regx = "(http[s]?://[\w\d:#@%/;$()~_?\+-;=\\\.&]*)(?