#
# Author: S. Van Hoey
# Contributors: John Wieczorek
#
# Build script for tdwg dwc handling
#
__version__ = '2023-09-14T-03:00'

import io
import os
import re
import csv
import sys
import codecs
from urllib import request

from jinja2 import FileSystemLoader, Environment

# Maps a namespace URL (everything up to and including the last "/" of a
# term IRI) to the abbreviation used for that namespace.
NAMESPACES = {
    'http://rs.tdwg.org/dwc/iri/': 'dwciri',
    'http://rs.tdwg.org/dwc/terms/': 'dwc',
    'http://rs.tdwg.org/chrono/terms/': 'chrono',
    'http://purl.org/dc/elements/1.1/': 'dc',
    'http://purl.org/dc/terms/': 'dcterms',
    'http://rs.tdwg.org/dwc/terms/attributes/': 'tdwgutility',
}


class ProvidedTermsError(Exception):
    """Inconsistency in the available terms."""


class RdfTypeError(Exception):
    """An rdf type was encountered that is not known by the builder."""


class DwcNamespaceError(Exception):
    """Namespace link is not available in the currently provided links."""


class DwcBuildReader:
    """Context manager opening a build file from raw Github or from disk.

    Entering the context returns a binary file-like object; leaving it
    closes that object.
    """

    def __init__(self, dwc_build_file):
        """Custom Reader switching between raw Github or local file

        Parameters
        -----------
        dwc_build_file : str
            Either a local path or a URL containing ``https://raw.github``
        """
        self.dwc_build_file = dwc_build_file

    def __enter__(self):
        """Open the resource and return the binary handle."""
        # Anything containing the raw Github prefix is fetched over HTTP,
        # everything else is treated as a local path.
        if "https://raw.github" in self.dwc_build_file:
            self.open_dwc_term = request.urlopen(self.dwc_build_file)
        else:
            self.open_dwc_term = open(self.dwc_build_file, 'rb')
        return self.open_dwc_term

    def __exit__(self, *args):
        """Close the handle opened in ``__enter__``; exceptions propagate."""
        self.open_dwc_term.close()


class DwcDigester(object):

    def __init__(self, term_versions):
        """Digest the term document of Darwin Core to support automatic
        generation of derivatives

        Parameters
        -----------
        term_versions : str
            Either a relative path and filename of the normative Dwc
            document or a URL link to the raw Github version of the file

        Notes
        -----
        Remark that the sequence of the term versions entries is
        essential for the automatic generation of the individual
        documents (mainly the index.html)
        """
        self.term_versions = term_versions

        self.term_versions_data = {}
        self._store_versions()

        # create the defined data-object for the different outputs
        # NOTE(review): process_terms is not defined in the part of the file
        # shown for this review - confirm it exists further down the class.
        self.template_data = self.process_terms()

    def versions(self):
        """Iterator providing the terms as represented in the normative term
        versions file

        Only rows whose ``status`` column equals ``recommended`` are yielded,
        each as the dictionary produced by ``csv.DictReader``.
        """
        with DwcBuildReader(self.term_versions) as versions:
            # newline='' lets the csv module do its own newline handling, so
            # line breaks inside quoted fields are kept intact.
            # NOTE(review): no encoding is given, so the platform default is
            # used for decoding - confirm that matches the versions file.
            text_stream = io.TextIOWrapper(versions, newline='')
            for vterm in csv.DictReader(text_stream, delimiter=','):
                if vterm["status"] == "recommended":
                    yield vterm

    def _store_versions(self):
        """Collect all the versions data in a dictionary as the
        term_versions_data attribute

        The dictionary is keyed on ``term_iri``; a later row with the same
        iri overwrites an earlier one.
        """
        for term in self.versions():
            self.term_versions_data[term["term_iri"]] = term

    @property
    def _version_terms(self):
        """Get an overview of the terms in the term_versions file

        Returns
        -------
        set
            The term iris stored in ``term_versions_data``
        """
        return set(self.term_versions_data.keys())

    def _select_versions_term(self, term_iri):
        """Select a specific term of the versions data, using term_iri match

        Raises
        ------
        KeyError
            When the term_iri is not part of the stored versions data
        """
        return self.term_versions_data[term_iri]

    @staticmethod
    def split_iri(term_iri):
        """Split an iri field into the namespace url and the local name
        of the term

        Parameters
        -----------
        term_iri : str
            iri containing at least one ``/``

        Returns
        -------
        tuple of (str, str)
            The namespace (up to and including the last ``/``) and the
            local name (everything after it)

        Raises
        ------
        IndexError
            When the iri does not contain a ``/``
        """
        prog = re.compile(r"(.*/)([^/]*$)")
        namespace, local_name = prog.findall(term_iri)[0]
        return namespace, local_name

    @staticmethod
    def resolve_namespace_abbrev(namespace):
        """Using the NAMESPACE constant, get the namespace abbreviation by
        providing the namespace link

        Parameters
        -----------
        namespace : str
            valid key of the NAMESPACES variable

        Raises
        ------
        DwcNamespaceError
            When the namespace is not a key of NAMESPACES
        """
        if namespace not in NAMESPACES:
            raise DwcNamespaceError("The namespace url is currently not supported in NAMESPACES")
        return NAMESPACES[namespace]

    def get_term_definition(self, term_iri):
        """Extract the required information from the terms table to show on
        the webpage of a single term by using the term_iri as the identifier

        Parameters
        -----------
        term_iri : str
            iri of a term available in the stored versions data

        Returns
        -------
        dict
            With the keys label, iri, class, definition, comments,
            examples, rdf_type and namespace

        Notes
        ------
        Due to the current implementation, make sure to provide the same
        keys represented in the record-level specific version
        `process_terms` method (room for improvement)
        """
        vs_term = self._select_versions_term(term_iri)
        term_data = {}
        # See https://github.com/tdwg/dwc/issues/253#issuecomment-670098202
        term_data["label"] = vs_term['term_localName']
        term_data["iri"] = term_iri
        term_data["class"] = vs_term['organized_in']
        term_data["definition"] = self.convert_link(vs_term['definition'])
        # Backticks are converted before the links, so the link conversion
        # runs on text that already contains the code tags.
        term_data["comments"] = self.convert_link(self.convert_code(vs_term['comments']))
        term_data["examples"] = self.convert_link(self.convert_code(vs_term['examples']))
        term_data["rdf_type"] = vs_term['rdf_type']
        namespace_url, _ = self.split_iri(term_iri)
        term_data["namespace"] = self.resolve_namespace_abbrev(namespace_url)
        return term_data

    @staticmethod
    def convert_code(text_with_backticks):
        """Takes all back-quoted sections in a text field and converts it to
        the html tagged version of code blocks <code>...</code>

        Parameters
        -----------
        text_with_backticks : str
            text in which code fragments are wrapped in single backticks

        Returns
        -------
        str
            The text with each back-quoted section wrapped in code tags
        """
        # NOTE(review): the reviewed source replaced with a bare r'\1', which
        # only removes the backticks; the code tags promised by the docstring
        # are restored here - confirm against the generated pages.
        return re.sub(r'`([^`]*)`', r'<code>\1</code>', text_with_backticks)

    @staticmethod
    def convert_link(text_with_urls):
        """Takes all links in a text field and converts it to the html tagged
        version of the link

        Parameters
        -----------
        text_with_urls : str
            text possibly containing http(s) urls

        Returns
        -------
        str
            The text with each matched url wrapped in an anchor tag
        """
        def _handle_matched(inputstring):
            """quick hack version of url handling on the current prime versions data"""
            url = inputstring.group()
            # NOTE(review): the reviewed source formatted "{}" with two
            # arguments, returning the url unchanged; the anchor markup
            # implied by the docstring is restored here.
            return '<a href="{}">{}</a>'.format(url, url)

        # Raw string equivalent of the original character class, avoiding
        # invalid escape sequences in a normal string literal.
        # NOTE(review): the pattern in the reviewed source is cut off after a
        # dangling "(?" following this group. That trailing construct is not
        # reproduced here; restore it from the upstream file if it matters
        # (e.g. for trimming characters at the end of a url).
        regx = r"(http[s]?://[\w\d:#@%/;$()~_?\+-;=\\.&]*)"
        # NOTE(review): the substitution call below is inferred from the
        # match-object signature of _handle_matched - confirm.
        return re.sub(regx, _handle_matched, text_with_urls)