dwc/build/build.py

109 lines
3.7 KiB
Python
Raw Normal View History

#
# S. Van Hoey
#
# Build script for tdwg dwc handling
#
import io
import csv
from urllib import request
class ProvidedTermsError(Exception):
    """Raised when the provided term files do not list the same terms."""
2017-09-30 14:48:27 +00:00
class RdfTypeError(Exception):
    """Raised when the builder encounters an rdf type it does not know."""
class DwcBuildReader:
    """Context manager opening a build file from raw GitHub or from disk.

    A location containing ``https://raw.github`` is fetched with
    ``urllib.request.urlopen``; any other location is opened as a local
    file in binary mode. The opened object is closed when the ``with``
    block is left.
    """

    def __init__(self, dwc_build_file):
        """Remember the location (URL or local path) of the file to open."""
        self.dwc_build_file = dwc_build_file

    def __enter__(self):
        """Open the file and return the resulting binary handle."""
        location = self.dwc_build_file
        if "https://raw.github" not in location:
            handle = open(location, 'rb')
        else:
            handle = request.urlopen(location)
        # Keep a reference so that __exit__ can close the same object.
        self.open_dwc_term = handle
        return handle

    def __exit__(self, *exc_info):
        """Close the handle opened by ``__enter__``."""
        self.open_dwc_term.close()
class DwcDigester(object):
    """Load the term versions and terms config CSV files and cross-check them.

    Both files are read when the object is created and indexed by their
    ``term_iri`` column. Creation fails with ``ProvidedTermsError`` when the
    two files do not provide the same set of term IRIs.
    """

    def __init__(self, term_versions, terms_config):
        """Read both files and verify that their terms correspond.

        Parameters
        ----------
        term_versions
            Location of the term versions CSV file, as accepted by
            ``DwcBuildReader`` (raw GitHub URL or local path).
        terms_config
            Location of the terms config CSV file, as accepted by
            ``DwcBuildReader`` (raw GitHub URL or local path).

        Raises
        ------
        ProvidedTermsError
            If a term IRI is present in only one of the two files.
        """
        self.term_versions = term_versions
        self.terms_config = terms_config

        self.term_versions_data = {}
        self._store_versions()
        self.terms_config_data = {}
        self._store_config()

        # check for the ability to combine the data
        self.match_error_report()

    @staticmethod
    def _csv_rows(binary_stream):
        """Return a ``csv.DictReader`` over a binary, UTF-8 encoded CSV stream.

        The encoding is pinned to UTF-8 so decoding does not depend on the
        locale of the machine running the build, and ``newline=""`` leaves
        line-ending handling to the csv module, as its documentation requires.
        """
        text_stream = io.TextIOWrapper(binary_stream, encoding="utf-8",
                                       newline="")
        return csv.DictReader(text_stream, delimiter=',')

    @staticmethod
    def _local_names(term_iris):
        """Return the last ``/``-separated part of each IRI, sorted and
        joined with ``", "``.

        Sorting keeps the resulting text identical between runs; iterating a
        set of strings directly gives an arbitrary order.
        """
        return ", ".join(sorted(iri.split("/")[-1] for iri in term_iris))

    def versions(self):
        """Iterate over the rows of the term versions file.

        Only rows whose ``status`` column equals ``recommended`` are yielded.
        """
        with DwcBuildReader(self.term_versions) as versions:
            for vterm in self._csv_rows(versions):
                if vterm["status"] == "recommended":
                    yield vterm

    def config(self):
        """Iterate over the rows of the terms config file, in file order."""
        with DwcBuildReader(self.terms_config) as configs:
            for cfterm in self._csv_rows(configs):
                yield cfterm

    def _store_versions(self):
        """Collect the rows yielded by ``versions()`` in a dictionary keyed
        by ``term_iri``."""
        for term in self.versions():
            self.term_versions_data[term["term_iri"]] = term

    def _store_config(self):
        """Collect the rows yielded by ``config()`` in a dictionary keyed
        by ``term_iri``."""
        for term in self.config():
            self.terms_config_data[term["term_iri"]] = term

    def _version_terms(self):
        """Return the set of term IRIs stored from the term versions file."""
        return set(self.term_versions_data)

    def _config_terms(self):
        """Return the set of term IRIs stored from the terms config file."""
        return set(self.terms_config_data)

    def _select_versions_term(self, term_iri):
        """Return the stored term versions row for ``term_iri``.

        Raises ``KeyError`` when the IRI is not present.
        """
        return self.term_versions_data[term_iri]

    def _select_config_term(self, term_iri):
        """Return the stored terms config row for ``term_iri``.

        Raises ``KeyError`` when the IRI is not present.
        """
        return self.terms_config_data[term_iri]

    def match_error_report(self):
        """Check that the term versions file and the terms config file
        provide corresponding terms.

        Raises
        ------
        ProvidedTermsError
            If either file holds term IRIs the other one lacks; the message
            lists the local names of the unmatched terms for each file.
        """
        version_terms = self._version_terms()
        config_terms = self._config_terms()
        overload_versionterms = version_terms - config_terms
        overload_configterms = config_terms - version_terms

        if overload_versionterms or overload_configterms:
            vs_terms = self._local_names(overload_versionterms)
            cf_terms = self._local_names(overload_configterms)
            raise ProvidedTermsError(
                "".join(["Terms only in term_versions.csv: ", vs_terms,
                         ". Terms only in terms_config.csv: ", cf_terms]))

    def process_terms(self):
        """Parse the config terms towards the structure required for the
        HTML template.

        Currently this only prints each row of the terms config file.
        """
        for term in self.config():
            print(term)