#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Module metadata.  The version stamp is the script's file name followed by
# an ISO-8601 style timestamp.
__author__ = "John Wieczorek"
__copyright__ = "Copyright 2021 Rauthiflor LLC"
__filename__ = "build_extension.py"
__version__ = __filename__ + " 2021-08-17T20:40-03:00"
import io
import os
import re
import csv
import sys
import codecs
import html
import argparse
from urllib import request
# Supported namespaces: maps a namespace URL (including its trailing slash)
# to the abbreviation used for it.
NAMESPACES = {
    "http://rs.tdwg.org/dwc/iri/": "dwciri",
    "http://rs.tdwg.org/dwc/terms/": "dwc",
    "http://rs.tdwg.org/chrono/terms/": "chrono",
    "http://purl.org/dc/elements/1.1/": "dc",
    "http://purl.org/dc/terms/": "dcterms",
    "http://rs.tdwg.org/dwc/terms/attributes/": "tdwgutility",
}
class ProvidedTermsError(Exception):
    """Raised when the available terms are inconsistent with each other."""
class RdfTypeError(Exception):
    """Raised when an rdftype is encountered that the builder does not know."""
class DwcNamespaceError(Exception):
    """Raised when a namespace link is not among the currently provided links."""
class DwcBuildReader:
    """Context manager that opens a build input for binary reading.

    The input is fetched with ``urllib.request.urlopen`` when it is a raw
    Github URL and opened as a local file in ``'rb'`` mode otherwise.  The
    opened object is returned by ``with ... as`` and closed when the block
    exits.
    """

    # Prefix identifying a raw Github URL (raw.githubusercontent.com starts
    # with it as well).
    _REMOTE_PREFIX = "https://raw.github"

    def __init__(self, dwc_build_file):
        """Custom Reader switching between raw Github or local file

        Parameters
        -----------
        dwc_build_file : str
            Either a local path and filename or a URL link to the raw
            Github version of the file
        """
        self.dwc_build_file = dwc_build_file
        # Set by __enter__; None until something has actually been opened.
        self.open_dwc_term = None

    def __enter__(self):
        """Open the input and return the binary file-like object."""
        # Anchor the test at the start of the string: a local path that only
        # contains the marker somewhere inside is still a local path.
        if self.dwc_build_file.startswith(self._REMOTE_PREFIX):
            self.open_dwc_term = request.urlopen(self.dwc_build_file)
        else:
            self.open_dwc_term = open(self.dwc_build_file, 'rb')
        return self.open_dwc_term

    def __exit__(self, *args):
        """Close the opened input; exceptions raised in the block propagate."""
        if self.open_dwc_term is not None:
            self.open_dwc_term.close()
class DwcDigester(object):
def __init__(self, term_versions):
    """Digest the term document of Darwin Core so that derivatives can be
    generated automatically

    Parameters
    -----------
    term_versions : str
        Relative path and filename of the normative Dwc document, or a URL
        link to the raw Github version of that file

    Notes
    -----
    The order of the entries in the term versions file is essential for the
    automatic generation of the individual documents (mainly the index.html)
    """
    # filled by _store_versions(), which needs both attributes to exist
    self.term_versions_data = {}
    self.term_versions = term_versions
    self._store_versions()
    # Data object shared by the different outputs.
    # NOTE(review): process_terms is not defined in the visible part of this
    # file - confirm it exists on the class.
    self.template_data = self.process_terms()
def versions(self):
    """Iterator providing the terms as represented in the normative term
    versions file

    The file named by ``self.term_versions`` is opened through
    ``DwcBuildReader`` and parsed as comma-separated values with a header
    row.

    Yields
    ------
    dict
        One row per term whose ``status`` column equals ``"recommended"``,
        in file order
    """
    with DwcBuildReader(self.term_versions) as versions:
        # Decode explicitly as UTF-8 rather than with the locale's default
        # encoding, so the parsed text does not depend on the machine running
        # the build.  newline="" is what the csv module asks for, so that
        # line breaks inside quoted fields are preserved.
        # NOTE(review): assumes the term versions file is UTF-8 - confirm.
        text = io.TextIOWrapper(versions, encoding="utf-8", newline="")
        for vterm in csv.DictReader(text, delimiter=','):
            if vterm["status"] == "recommended":
                yield vterm
def _store_versions(self):
    """Index every term yielded by ``versions()`` by its ``term_iri``

    The rows end up in the ``term_versions_data`` dictionary; a later row
    with the same ``term_iri`` replaces an earlier one.
    """
    rows_by_iri = ((row["term_iri"], row) for row in self.versions())
    self.term_versions_data.update(rows_by_iri)
@property
def _version_terms(self):
    """Set of the term IRIs currently held in ``term_versions_data``"""
    return set(self.term_versions_data)
def _select_versions_term(self, term_iri):
    """Return the stored versions row whose key equals ``term_iri``

    A ``KeyError`` is raised when no row was stored under that IRI.
    """
    stored_rows = self.term_versions_data
    return stored_rows[term_iri]
@staticmethod
def split_iri(term_iri):
    """Separate an iri into the namespace url and the local name of the term

    For a single-line iri the namespace is everything up to and including
    the last ``/`` and the local name is whatever follows it (possibly
    empty).  An ``IndexError`` is raised when the iri contains no ``/``.
    """
    matches = re.findall("(.*/)([^/]*$)", term_iri)
    namespace, local_name = matches[0]
    return namespace, local_name
@staticmethod
def resolve_namespace_abbrev(namespace):
    """Look up the abbreviation of a namespace link in the NAMESPACES
    constant

    Parameters
    -----------
    namespace : str
        valid key of the NAMESPACES variable

    Raises
    ------
    DwcNamespaceError
        When the namespace is not a key of NAMESPACES
    """
    if namespace in NAMESPACES:
        return NAMESPACES[namespace]
    raise DwcNamespaceError("The namespace url is currently not supported in NAMESPACES")
def get_term_definition(self, term_iri):
    """Collect the information shown on the webpage of a single term,
    using the term_iri as the identifier

    Returns a dictionary with the keys ``label``, ``iri``, ``class``,
    ``definition``, ``comments``, ``examples``, ``rdf_type`` and
    ``namespace``.

    Notes
    ------
    Due to the current implementation, make sure to provide the same keys
    represented in the record-level specific version `process_terms`
    method (room for improvement)
    """
    row = self._select_versions_term(term_iri)
    # definition, comments and examples are copied verbatim from the versions
    # row; convert_link / convert_code are not applied to them here
    term_data = {
        # See https://github.com/tdwg/dwc/issues/253#issuecomment-670098202
        "label": row['term_localName'],
        "iri": term_iri,
        "class": row['organized_in'],
        "definition": row['definition'],
        "comments": row['comments'],
        "examples": row['examples'],
        "rdf_type": row['rdf_type'],
    }
    namespace_url, _local_name = self.split_iri(term_iri)
    term_data["namespace"] = self.resolve_namespace_abbrev(namespace_url)
    return term_data
@staticmethod
def convert_code(text_with_backticks):
    """Takes all back-quoted sections in a text field and converts them to
    the html tagged version of code blocks

    Parameters
    -----------
    text_with_backticks : str
        Text in which code fragments are delimited by single backticks

    Returns
    -------
    str
        The text with every back-quoted fragment wrapped in an opening and
        closing html ``code`` tag instead of the backticks; text without a
        pair of backticks is returned unchanged
    """
    # NOTE(review): the replacement template had lost its markup and was left
    # as an unterminated string literal.  The code tags are restored here to
    # match the documented purpose - confirm against the upstream file.
    return re.sub(r'`([^`]*)`', r'<code>\1</code>', text_with_backticks)
@staticmethod
def convert_link(text_with_urls):
"""Takes all links in a text field and converts them to the html tagged
version of the link
"""
def _handle_matched(inputstring):
"""quick hack version of url handling on the current prime versions data"""
url = inputstring.group()
# NOTE(review): two arguments are passed but the template has a single
# placeholder, so only the first one is used; presumably the surrounding
# link markup was lost from this literal - TODO confirm against upstream.
return "{}".format(url, url)
# NOTE(review): the pattern literal below is never closed, and the lines after
# it use names (group, previous_group, output_file, termlistfile, s) that are
# not defined in this method.  The rest of convert_link and the beginning of
# another method (presumably create_extension_xml, which main() calls) appear
# to be missing from this copy - restore them from the original file before
# running the script.
regx = "(http[s]?://[\w\d:#@%/;$()~_?\+-;=\\\.&]*)(?\n'
if group != previous_group:
output_file.write(f'\n \n')
output_file.write(s)
previous_group = group
output_file.write("")
output_file.close()
termlistfile.close()
def _getoptions():
    """Parse command line options and return them.

    Four options are declared, none of them required by the parser:
    ``-i/--extensiontermsfile``, ``-x/--extensiontemplatefile``,
    ``-o/--outputfile`` and ``-t/--termversionsfile``.
    """
    option_specs = (
        ("-i", "--extensiontermsfile", 'path to the extension term list csv file'),
        ("-x", "--extensiontemplatefile", 'path to the extension xml template file'),
        ("-o", "--outputfile", 'path to the output extension xml file'),
        ("-t", "--termversionsfile", 'path to the dwc term versions csv file'),
    )
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, description in option_specs:
        parser.add_argument(short_flag, long_flag, help=description)
    return parser.parse_args()
def main():
    """Build XML Darwin Core Extension files

    Reads the command line options.  When the extension terms file, the
    template file or the output file is missing or empty, a usage example
    is printed and nothing is built.  Otherwise the term versions file
    (``../vocabulary/term_versions.csv`` unless ``-t`` is given) is digested
    and the extension XML file is written.

    Returns
    -------
    None
    """
    options = _getoptions()
    required = (
        options.extensiontermsfile,
        options.extensiontemplatefile,
        options.outputfile,
    )
    # None and '' are both falsy, so a single truthiness test covers
    # "not given" as well as "given but empty".
    if not all(required):
        print(
            'syntax:\n'
            f'python {__filename__}'
            ' -x ./occurrence_core.tmpl'
            ' -i ./occurrence_core_list.csv'
            ' -o ../ext/dwc_occurrence_2021-08-16.xml'
            ' -t ../vocabulary/term_versions.csv'
        )
        return

    # fall back to the default location when -t is absent or empty
    term_versions_file = options.termversionsfile or "../vocabulary/term_versions.csv"

    print("Running build process:")
    my_dwc = DwcDigester(term_versions_file)
    print("Building Extension XML file")
    my_dwc.create_extension_xml(
        options.extensiontemplatefile,
        options.extensiontermsfile,
        options.outputfile,
    )
    print("Done!")
# Run the builder only when the file is executed as a script; the value
# returned by main() is handed to the interpreter as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())