# # S. Van Hoey # # Build script for tdwg dwc handling # import io import os import re import csv import sys import codecs from urllib import request from jinja2 import FileSystemLoader, Environment NAMESPACES = { 'http://rs.tdwg.org/dwc/iri/' : 'dwciri', 'http://rs.tdwg.org/dwc/terms/' : 'dwc', 'http://purl.org/dc/elements/1.1/' : 'dc', 'http://purl.org/dc/terms/' : 'dcterms', 'http://rs.tdwg.org/dwc/terms/attributes/' : 'tdwgutility'} class ProvidedTermsError(Exception): """inconsistency in the available terms Error""" pass class RdfTypeError(Exception): """rdftype encountered that is not known by builder""" pass class DwcNamespaceError(Exception): """Namespace link is not available in the currently provided links""" pass class DwcBuildReader(): def __init__(self, dwc_build_file): """Custom Reader switching between to raw Github or local file""" self.dwc_build_file = dwc_build_file def __enter__(self): if "https://raw.github" in self.dwc_build_file: self.open_dwc_term = request.urlopen(self.dwc_build_file) else: self.open_dwc_term = open(self.dwc_build_file, 'rb') return self.open_dwc_term def __exit__(self, *args): self.open_dwc_term.close() class DwcDigester(object): def __init__(self, term_versions, terms_config): """digest the normative document of Darwin Core and the configurations file to support automatic generation of derivatives Parameters ----------- term_versions : str either a relative path and filename of the normative Dwc document or a URL link to the raw Github version of the file terms_config : str either a relative path and filename of the configurations file or a URL link to the raw Github version of the file Notes ----- Remark that the sequence of the configurations file entries is essential for the automatic generation of the individual documents (mainly the index.html) """ self.term_versions = term_versions self.terms_config = terms_config self.term_versions_data = {} self._store_versions() self.terms_config_data = {} self._store_configs() # check for the ability to combine the data self.match_error_report() # create the defined data-object for the different outputs self.template_data = self.process_terms() def versions(self): """iterator providing the terms as represented in the normative term versions file""" with DwcBuildReader(self.term_versions) as versions: for vterm in csv.DictReader(io.TextIOWrapper(versions), delimiter=','): if vterm["status"] == "recommended": yield vterm def configs(self): """iterator providing the terms as represented in the terms config file (taking into account the sequence)""" with DwcBuildReader(self.terms_config) as configs: for cfterm in csv.DictReader(io.TextIOWrapper(configs), delimiter=','): yield cfterm def _store_versions(self): """collect all the versions data in a dictionary as the term_versions_data attribute""" for term in self.versions(): self.term_versions_data[term["term_iri"]] = term def _store_configs(self): """collect all the config data in a dictionary as the terms_config_data attribute""" for term in self.configs(): self.terms_config_data[term["term_iri"]] = term @property def _version_terms(self): """get an overview of the terms in the term_versions file""" return set(self.term_versions_data.keys()) @property def _config_terms(self): """get an overview of the terms in the terms config file""" return set(self.terms_config_data.keys()) def _select_versions_term(self, term_iri): """select a specific term of the versions data, using term_iri match""" return self.term_versions_data[term_iri] def _select_config_term(self, term_iri): """select a specific term of the config data, using term_iri match""" return self.terms_config_data[term_iri] def match_error_report(self): """check if the prime dwc file and the configurations file provide corresponding terms and inform user on the term differences in between both files""" overload_versionterms = self._version_terms - self._config_terms overload_configterms = self._config_terms - self._version_terms if len(overload_versionterms) > 0 or len(overload_configterms) > 0: vs_terms = ", ".join([term.split("/")[-1] for term in overload_versionterms]) cf_terms = ", ".join([term.split("/")[-1] for term in overload_configterms]) raise ProvidedTermsError("".join(["Terms only in term_versions.csv: ", vs_terms, ". Terms only in terms_config.csv: ", cf_terms])) @staticmethod def split_iri(term_iri): """split an iri field into the namespace url and the local name of the term""" prog = re.compile("(.*/)([^/]*$)") namespace, local_name = prog.findall(term_iri)[0] return namespace, local_name @staticmethod def resolve_namespace_abbrev(namespace): """Using the NAMESPACE constant, get the namespace abbreviation by providing the namespace link""" if namespace not in NAMESPACES.keys(): raise DwcNamespaceError("The namespace url is currently not supported in NAMESPACES") return NAMESPACES[namespace] def get_term_definition(self, term_iri): """Extract the required information from both tables to show on the webpage of a single term by using the term_iri as the identifier Notes ------ Due to the current implementation, make sure to provide the same keys represented in the record-level specific version `process_terms` method (room for improvement) """ cf_term = self._select_config_term(term_iri) vs_term = self._select_versions_term(term_iri) term_data = {} term_data["label"] = vs_term['label'] term_data["iri"] = term_iri term_data["class"] = cf_term['organized_in'] term_data["definition"] = self.convert_link(vs_term['definition']) term_data["comments"] = self.convert_link(self.convert_code(cf_term['comments'])) term_data["rdf_type"] = vs_term['rdf_type'] namespace_url, _ = self.split_iri(term_iri) term_data["namespace"] = self.resolve_namespace_abbrev(namespace_url) return term_data @staticmethod def convert_code(text_with_backticks): """takes all back-quoted sections in a text field and converts it to the html tagged version of code blocks ... """ return re.sub(r'`([^`]*)`', r'\1', text_with_backticks) @staticmethod def convert_link(text_with_urls): """takes all links in a text field and converts it to the html tagged version of the link """ def _handle_matched(inputstring): """quick hack version of url handling on the current prime versions data""" url = inputstring.group() return "{}".format(url, url) regx = "(http[s]?://[\w\d:#@%/;$()~_?\+-;=\\\.&]*)(?