#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__author__ = "John Wieczorek"
__copyright__ = "Copyright 2021 Rauthiflor LLC"
__filename__ = 'build_extension.py'
__version__ = f'{__filename__} 2021-08-17T20:40-03:00'

import io
import os
import re
import csv
import sys
import codecs
import html
import argparse
from urllib import request

# Maps a namespace IRI (everything up to and including the last "/") to the
# abbreviation reported for terms in that namespace.
NAMESPACES = {
    'http://rs.tdwg.org/dwc/iri/': 'dwciri',
    'http://rs.tdwg.org/dwc/terms/': 'dwc',
    'http://rs.tdwg.org/chrono/terms/': 'chrono',
    'http://purl.org/dc/elements/1.1/': 'dc',
    'http://purl.org/dc/terms/': 'dcterms',
    'http://rs.tdwg.org/dwc/terms/attributes/': 'tdwgutility',
}


class ProvidedTermsError(Exception):
    """inconsistency in the available terms Error"""


class RdfTypeError(Exception):
    """rdftype encountered that is not known by builder"""


class DwcNamespaceError(Exception):
    """Namespace link is not available in the currently provided links"""


class DwcBuildReader:
    """Context manager yielding a binary stream for a term versions source.

    The source is fetched with ``urllib.request.urlopen`` when it contains
    ``"https://raw.github"`` and opened as a local file in binary mode
    otherwise. The stream is closed when the ``with`` block exits.
    """

    def __init__(self, dwc_build_file):
        """Custom Reader switching between raw Github or local file"""
        self.dwc_build_file = dwc_build_file
        self.open_dwc_term = None

    def __enter__(self):
        if "https://raw.github" in self.dwc_build_file:
            self.open_dwc_term = request.urlopen(self.dwc_build_file)
        else:
            self.open_dwc_term = open(self.dwc_build_file, 'rb')
        return self.open_dwc_term

    def __exit__(self, *args):
        self.open_dwc_term.close()


class DwcDigester:
    """Digest the Darwin Core term versions document for derived outputs."""

    def __init__(self, term_versions):
        """Digest the term document of Darwin Core to support automatic
        generation of derivatives

        Parameters
        -----------
        term_versions : str
            Either a relative path and filename of the normative Dwc document
            or a URL link to the raw Github version of the file

        Notes
        -----
        Remark that the sequence of the term versions entries is
        essential for the automatic generation of the individual documents
        (mainly the index.html)
        """
        self.term_versions = term_versions
        self.term_versions_data = {}
        self._store_versions()

        # create the defined data-object for the different outputs
        self.template_data = self.process_terms()

    def versions(self):
        """Iterator providing the terms as represented in the normative term
        versions file

        Only rows whose ``status`` column equals ``"recommended"`` are
        yielded, each as the dict produced by ``csv.DictReader``.
        """
        with DwcBuildReader(self.term_versions) as versions:
            # Decode explicitly as UTF-8 instead of relying on the locale
            # default, and let the csv module do its own newline handling.
            text_stream = io.TextIOWrapper(versions, encoding="utf-8",
                                           newline="")
            for vterm in csv.DictReader(text_stream, delimiter=','):
                if vterm["status"] == "recommended":
                    yield vterm

    def _store_versions(self):
        """Collect all the versions data in a dictionary as the
        term_versions_data attribute, keyed by ``term_iri``
        """
        for term in self.versions():
            self.term_versions_data[term["term_iri"]] = term

    @property
    def _version_terms(self):
        """Get an overview of the terms in the term_versions file
        (the set of stored term IRIs)
        """
        return set(self.term_versions_data)

    def _select_versions_term(self, term_iri):
        """Select a specific term of the versions data, using term_iri match

        Raises
        ------
        KeyError
            If ``term_iri`` is not among the stored terms.
        """
        return self.term_versions_data[term_iri]

    @staticmethod
    def split_iri(term_iri):
        """Split an iri field into the namespace url and the local name of
        the term

        The namespace is everything up to and including the last ``/``; the
        local name is whatever follows it.

        Raises
        ------
        IndexError
            If ``term_iri`` contains no ``/``.
        """
        prog = re.compile("(.*/)([^/]*$)")
        namespace, local_name = prog.findall(term_iri)[0]
        return namespace, local_name

    @staticmethod
    def resolve_namespace_abbrev(namespace):
        """Using the NAMESPACE constant, get the namespace abbreviation by
        providing the namespace link

        Parameters
        -----------
        namespace : str
            valid key of the NAMESPACES variable

        Raises
        ------
        DwcNamespaceError
            If ``namespace`` is not a key of NAMESPACES.
        """
        try:
            return NAMESPACES[namespace]
        except KeyError:
            raise DwcNamespaceError(
                "The namespace url is currently not supported in NAMESPACES"
            ) from None

    def get_term_definition(self, term_iri):
        """Extract the required information from the terms table to show on
        the webpage of a single term by using the term_iri as the identifier

        Returns a dict with the keys ``label``, ``iri``, ``class``,
        ``definition``, ``comments``, ``examples``, ``rdf_type`` and
        ``namespace``.

        Notes
        ------
        Due to the current implementation, make sure to provide the same keys
        represented in the record-level specific version `process_terms`
        method (room for improvement)
        """
        vs_term = self._select_versions_term(term_iri)
        term_data = {}
        # See https://github.com/tdwg/dwc/issues/253#issuecomment-670098202
        term_data["label"] = vs_term['term_localName']
        term_data["iri"] = term_iri
        term_data["class"] = vs_term['organized_in']
        term_data["definition"] = vs_term['definition']
        term_data["comments"] = vs_term['comments']
        term_data["examples"] = vs_term['examples']
        # The HTML conversions below are disabled in this builder; the text
        # fields are passed through unchanged.
        # term_data["definition"] = self.convert_link(vs_term['definition'])
        # term_data["comments"] = self.convert_link(self.convert_code(vs_term['comments']))
        # term_data["examples"] = self.convert_link(self.convert_code(vs_term['examples']))
        term_data["rdf_type"] = vs_term['rdf_type']
        namespace_url, _ = self.split_iri(term_iri)
        term_data["namespace"] = self.resolve_namespace_abbrev(namespace_url)
        return term_data

    @staticmethod
    def convert_code(text_with_backticks):
        """Takes all back-quoted sections in a text field and converts it to
        the html tagged version of code blocks <code>...</code>
        """
        # The damaged source substituted a bare r'\1', which only removed the
        # backticks and contradicted this docstring; the tags are restored.
        return re.sub(r'`([^`]*)`', r'<code>\1</code>', text_with_backticks)

    @staticmethod
    def convert_link(text_with_urls):
        """Takes all links in a text field and converts it to the html tagged
        version of the link
        """
        def _handle_matched(inputstring):
            """quick hack version of url handling on the current prime
            versions data"""
            url = inputstring.group()
            # The url is used both as the href and as the link text, which is
            # why the source passed it to format() twice.
            return '<a href="{}">{}</a>'.format(url, url)

        # NOTE(review): the source pattern is cut off right after "(?". The
        # negative lookbehind below (do not end the match on ")", "." or ",")
        # is a reconstruction - confirm against version control.
        # Raw string: same regex text, without invalid escape sequences.
        regx = r"(http[s]?://[\w\d:#@%/;$()~_?\+-;=\\.&]*)(?<![\)\.,])"
        return re.sub(regx, _handle_matched, text_with_urls)

    # ------------------------------------------------------------------
    # NOTE(review): DAMAGED REGION. In the text this file was recovered
    # from, everything between the regex above and the end of
    # `create_extension_xml` was destroyed by markup stripping. That covers
    # the whole of `process_terms` and nearly all of `create_extension_xml`.
    # The two methods below are explicit placeholders so the failure is loud;
    # restore both bodies from version control.
    #
    # Surviving fragment (tail of `create_extension_xml`), verbatim:
    #   \n' if group != previous_group: output_file.write(f'\n \n')
    #   output_file.write(s) previous_group = group output_file.write("")
    #   output_file.close() termlistfile.close()
    # ------------------------------------------------------------------

    def process_terms(self):
        """Placeholder for the lost `process_terms` implementation.

        `__init__` stores this method's return value as ``template_data``.
        Nothing else about it is recoverable from the damaged source.

        Raises
        ------
        NotImplementedError
            Always, until the original body is restored.
        """
        raise NotImplementedError(
            "process_terms was lost from the source text; "
            "restore it from version control"
        )

    def create_extension_xml(self, xml_template, termlist, file_output):
        """Placeholder for the lost `create_extension_xml` implementation.

        `main` calls it with the extension XML template path, the extension
        term list CSV path and the output XML path, in that order.

        Raises
        ------
        NotImplementedError
            Always, until the original body is restored.
        """
        raise NotImplementedError(
            "create_extension_xml was lost from the source text; "
            "restore it from version control"
        )


def _getoptions():
    """Parse command line options and return them."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--extensiontermsfile",
                        help='path to the extension term list csv file')
    parser.add_argument("-x", "--extensiontemplatefile",
                        help='path to the extension xml template file')
    parser.add_argument("-o", "--outputfile",
                        help='path to the output extension xml file')
    parser.add_argument("-t", "--termversionsfile",
                        help='path to the dwc term versions csv file')
    return parser.parse_args()


def main():
    """Build XML Darwin Core Extension files

    Returns the process exit status: 2 when a required option is missing
    (the usage text is printed), 0 after a successful build.
    """
    options = _getoptions()

    required = (options.extensiontermsfile,
                options.extensiontemplatefile,
                options.outputfile)
    if not all(required):
        usage = ''.join((
            'syntax:\n',
            f'python {__filename__}',
            ' -x ./occurrence_core.tmpl',
            ' -i ./occurrence_core_list.csv',
            ' -o ../ext/dwc_occurrence_2021-08-16.xml',
            ' -t ../vocabulary/term_versions.csv',
        ))
        print(usage)
        # Non-zero so shells and build scripts can detect the usage error.
        return 2

    # Fall back to the repository-relative default when -t is not given.
    term_versions_file = (options.termversionsfile
                          or "../vocabulary/term_versions.csv")

    print("Running build process:")
    my_dwc = DwcDigester(term_versions_file)
    print("Building Extension XML file")
    xml_template = options.extensiontemplatefile
    termlist = options.extensiontermsfile
    file_output = options.outputfile
    my_dwc.create_extension_xml(xml_template, termlist, file_output)
    print("Done!")
    return 0


if __name__ == "__main__":
    sys.exit(main())