Merge pull request #206 from tdwg/update-build

Update build script
This commit is contained in:
Peter Desmet 2018-10-15 13:39:19 +02:00 committed by GitHub
commit 62af4272c7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 2071 additions and 2890 deletions

View File

@ -1,16 +1,25 @@
The build folder contains a python script to rebuild the terms index page.
# Build script
It will use the `vocabulary/term_versions.csv` and the `./config/terms_config.csv` to populate the html-template file (`./config/index.tmpl`)
The build script `build.py` uses as input:
* [vocabulary/term_versions.csv](../vocabulary/term_versions.csv): the list of terms
* [docs/_layouts/terms.tmpl](../docs/_layouts/terms.tmpl): a Jinja2 template for the quick reference guide
To create:
* The quick reference guide as a Markdown file at [docs/terms/index.md](../docs/terms/index.md). The guide is built as Markdown (with a lot of included html) rather than html, so it can be incorporated by Jekyll in the Darwin Core website (including a header, footer and table of contents).
* Two simple Darwin Core CSV files in [dist/](../dist/)
## Run the build script
Install the required libraries for this script with pip once:
```
pip install -r requirements.txt
```
1. Install the required libraries (once):
To run the script, run the build script from command line:
```bash
pip install -r requirements.txt
```
```
python build.py
```
2. Run the script from the command line:
```bash
python build.py
```

View File

@ -38,7 +38,7 @@ class DwcNamespaceError(Exception):
class DwcBuildReader():
def __init__(self, dwc_build_file):
"""Custom Reader switching between to raw Github or local file"""
"""Custom Reader switching between raw Github or local file"""
self.dwc_build_file = dwc_build_file
def __enter__(self):
@ -54,118 +54,99 @@ class DwcBuildReader():
class DwcDigester(object):
def __init__(self, term_versions, terms_config):
"""digest the normative document of Darwin Core and the configurations file to support automatic generation of derivatives
def __init__(self, term_versions):
"""Digest the term document of Darwin Core to support automatic
generation of derivatives
Parameters
-----------
term_versions : str
either a relative path and filename of the normative Dwc document or a URL link to the
raw Github version of the file
terms_config : str
either a relative path and filename of the configurations file or a URL link to the
raw Github version of the file
Either a relative path and filename of the normative Dwc document
or a URL link to the raw Github version of the file
Notes
-----
Remark that the sequence of the configurations file entries is essential for the automatic generation of the individual documents (mainly the index.html)
Remark that the sequence of the term versions entries is
essential for the automatic generation of the individual documents
(mainly the index.html)
"""
self.term_versions = term_versions
self.terms_config = terms_config
self.term_versions_data = {}
self._store_versions()
self.terms_config_data = {}
self._store_configs()
# check for the ability to combine the data
self.match_error_report()
# create the defined data-object for the different outputs
self.template_data = self.process_terms()
def versions(self):
"""iterator providing the terms as represented in the normative term versions file"""
"""Iterator providing the terms as represented in the normative term
versions file
"""
with DwcBuildReader(self.term_versions) as versions:
for vterm in csv.DictReader(io.TextIOWrapper(versions), delimiter=','):
if vterm["status"] == "recommended":
yield vterm
def configs(self):
"""iterator providing the terms as represented in the terms config file
(taking into account the sequence)"""
with DwcBuildReader(self.terms_config) as configs:
for cfterm in csv.DictReader(io.TextIOWrapper(configs), delimiter=','):
yield cfterm
def _store_versions(self):
"""collect all the versions data in a dictionary as the term_versions_data attribute"""
"""Collect all the versions data in a dictionary as the
term_versions_data attribute
"""
for term in self.versions():
self.term_versions_data[term["term_iri"]] = term
def _store_configs(self):
"""collect all the config data in a dictionary as the terms_config_data attribute"""
for term in self.configs():
self.terms_config_data[term["term_iri"]] = term
@property
def _version_terms(self):
"""get an overview of the terms in the term_versions file"""
"""Get an overview of the terms in the term_versions file
"""
return set(self.term_versions_data.keys())
@property
def _config_terms(self):
"""get an overview of the terms in the terms config file"""
return set(self.terms_config_data.keys())
def _select_versions_term(self, term_iri):
"""select a specific term of the versions data, using term_iri match"""
"""Select a specific term of the versions data, using term_iri match
"""
return self.term_versions_data[term_iri]
def _select_config_term(self, term_iri):
"""select a specific term of the config data, using term_iri match"""
return self.terms_config_data[term_iri]
def match_error_report(self):
"""check if the prime dwc file and the configurations file provide corresponding terms and inform user on the term differences in between both files"""
overload_versionterms = self._version_terms - self._config_terms
overload_configterms = self._config_terms - self._version_terms
if len(overload_versionterms) > 0 or len(overload_configterms) > 0:
vs_terms = ", ".join([term.split("/")[-1] for term in overload_versionterms])
cf_terms = ", ".join([term.split("/")[-1] for term in overload_configterms])
raise ProvidedTermsError("".join(["Terms only in term_versions.csv: ", vs_terms,
". Terms only in terms_config.csv: ", cf_terms]))
@staticmethod
def split_iri(term_iri):
"""split an iri field into the namespace url and the local name of the term"""
"""Split an iri field into the namespace url and the local name
of the term
"""
prog = re.compile("(.*/)([^/]*$)")
namespace, local_name = prog.findall(term_iri)[0]
return namespace, local_name
@staticmethod
def resolve_namespace_abbrev(namespace):
"""Using the NAMESPACE constant, get the namespace abbreviation by providing the namespace link"""
"""Using the NAMESPACE constant, get the namespace abbreviation by
providing the namespace link
Parameters
-----------
namespace : str
valid key of the NAMESPACES variable
"""
if namespace not in NAMESPACES.keys():
raise DwcNamespaceError("The namespace url is currently not supported in NAMESPACES")
return NAMESPACES[namespace]
def get_term_definition(self, term_iri):
"""Extract the required information from both tables to show on the webpage of a single term
by using the term_iri as the identifier
"""Extract the required information from the terms table to show on
the webpage of a single term by using the term_iri as the identifier
Notes
------
Due to the current implementation, make sure to provide the same keys represented in the record-level specific version `process_terms` method (room for improvement)
Due to the current implementation, make sure to provide the same keys
represented in the record-level specific version `process_terms`
method (room for improvement)
"""
cf_term = self._select_config_term(term_iri)
vs_term = self._select_versions_term(term_iri)
term_data = {}
term_data["label"] = vs_term['label']
term_data["iri"] = term_iri
term_data["class"] = cf_term['organized_in']
term_data["class"] = vs_term['organized_in']
term_data["definition"] = self.convert_link(vs_term['definition'])
term_data["comments"] = self.convert_link(self.convert_code(cf_term['comments']))
term_data["comments"] = self.convert_link(self.convert_code(vs_term['comments']))
term_data["examples"] = self.convert_link(self.convert_code(vs_term['examples']))
term_data["rdf_type"] = vs_term['rdf_type']
namespace_url, _ = self.split_iri(term_iri)
term_data["namespace"] = self.resolve_namespace_abbrev(namespace_url)
@ -173,13 +154,15 @@ class DwcDigester(object):
@staticmethod
def convert_code(text_with_backticks):
"""takes all back-quoted sections in a text field and converts it to the html tagged version of code blocks <code>...</code>
"""Takes all back-quoted sections in a text field and converts it to
the html tagged version of code blocks <code>...</code>
"""
return re.sub(r'`([^`]*)`', r'<code>\1</code>', text_with_backticks)
@staticmethod
def convert_link(text_with_urls):
"""takes all links in a text field and converts it to the html tagged version of the link
"""Takes all links in a text field and converts it to the html tagged
version of the link
"""
def _handle_matched(inputstring):
"""quick hack version of url handling on the current prime versions data"""
@ -190,11 +173,17 @@ class DwcDigester(object):
return re.sub(regx, _handle_matched, text_with_urls)
def process_terms(self):
"""parse the config terms (sequence matters!), collect all required data from both the normative versions file and the config file and return the template ready data.
"""Parse the config terms (sequence matters!)
Collect all required data from the normative term versions file and
return the template ready data.
Returns
-------
Data object that can be digested by the html-template file. Contains the term data formatted to create the indidivual outputs, each list element is a dictionary representing a class group. Hence, the data object is structured as follows:
Data object that can be digested by the html-template file. Contains
the term data formatted to create the individual outputs, each list
element is a dictionary representing a class group. Hence, the data
object is structured as follows:
[
{'name' : class_group_name_1, 'label': xxxx,...,
@ -222,7 +211,7 @@ class DwcDigester(object):
class_group["terms"] = []
class_group["namespace"] = None
for term in self.configs(): # sequence of the config file used as order
for term in self.versions(): # sequence of the terms file used as order
term_data = self.get_term_definition(term['term_iri'])
# new class encountered
if term_data["rdf_type"] == "http://www.w3.org/2000/01/rdf-schema#Class":
@ -240,7 +229,8 @@ class DwcDigester(object):
def create_html(self, html_template="../docs/_layouts/terms.tmpl",
html_output="../docs/terms/index.md"):
"""build html with the processed term info, by filling in the tmpl-template
"""build html with the processed term info, by filling in the
tmpl-template
Parameters
-----------
@ -254,7 +244,10 @@ class DwcDigester(object):
data = {}
data["class_groups"] = self.template_data
env = Environment(loader = FileSystemLoader(os.path.dirname(html_template)))
env = Environment(
loader = FileSystemLoader(os.path.dirname(html_template)),
trim_blocks = True
)
template = env.get_template(os.path.basename(html_template))
html = template.render(data)
@ -263,9 +256,11 @@ class DwcDigester(object):
index_page.close()
def simple_dwc_terms(self):
"""only extract those terms that are simple dwc, defined as `simple` in the flags column of the config file of terms"""
"""Only extract those terms that are simple dwc, defined as `simple`
in the flags column of the term versions file
"""
properties = []
for term in self.configs():
for term in self.versions():
term_data = self.get_term_definition(term['term_iri'])
if (term_data["rdf_type"] == "http://www.w3.org/1999/02/22-rdf-syntax-ns#Property" and
term["flags"] == "simple"):
@ -273,7 +268,7 @@ class DwcDigester(object):
return properties
def create_dwc_list(self, file_output="../dist/simple_dwc_vertical.csv"):
"""build a list of simple dwc terms and write it to file
"""Build a list of simple dwc terms and write it to file
Parameters
-----------
@ -285,7 +280,7 @@ class DwcDigester(object):
dwc_list_file.write(term + "\n")
def create_dwc_header(self, file_output="../dist/simple_dwc_horizontal.csv"):
"""build a header of simple dwc terms and write it to file
"""Build a header of simple dwc terms and write it to file
Parameters
-----------
@ -300,11 +295,10 @@ class DwcDigester(object):
def main():
"""Building up the quick reference html and derivatives"""
config_terms_file = "./config/terms.csv"
term_versions_file = "../vocabulary/term_versions.csv"
print("Running build process:")
my_dwc = DwcDigester(term_versions_file, config_terms_file)
my_dwc = DwcDigester(term_versions_file)
print("Building quick reference guide")
my_dwc.create_html()
print("Building simple DwC CSV files")

View File

@ -10,33 +10,38 @@ This page provides a list of all currently recommended terms of the Darwin Core
{% for class_group in class_groups %}
## {{ class_group.label }}
<div class="my-4">
{% for term in class_group.terms %}
<a class="btn btn-sm btn-outline-secondary m-1" href="#{{ term.namespace}}:{{ term.label }}">{{ term.label }}</a>
<a class="btn btn-sm btn-outline-secondary m-1" href="#{{ term.namespace }}:{{ term.label }}">{{ term.label }}</a>
{% endfor %}
</div>
{% if class_group.iri is not none %}
{# The above will exclude the Record-level class_group #}
<table class="table table-sm border mb-3">
{# Class (the above will exclude the Record-level class_group) #}
<table class="table table-sm table-bordered">
<tbody>
<tr class="table-primary"><th colspan="2">{{ class_group.label }}<span class="badge badge-primary float-right">Class</span></th></tr>
<tr><td class="label">Identifier</td><td><a href="{{ class_group.iri }}">{{ class_group.iri }}</a></td></tr>
<tr><td class="label">Definition</td><td>{{class_group.definition}}</td></tr>
<tr><td class="label">Comments</td><td>{{class_group.comments}}</td></tr>
<tr><td class="label">Definition</td><td>{{ class_group.definition }}</td></tr>
<tr><td class="label">Comments</td><td>{{ class_group.comments }}</td></tr>
<tr><td class="label">Examples</td><td>{{ class_group.examples }}</td></tr>
</tbody>
</table>
{%endif %}
{% for term in class_group.terms %}
<a id="{{ term.namespace}}:{{ term.label }}" class="anchor"></a>
<table class="table table-sm border mb-3">
{# Term #}
<p class="invisible">
<a id="{{ term.namespace }}:{{ term.label }}"></a>{% if term.namespace != "dwciri" %}<a id="{{ term.label }}"></a>{% endif %}
</p>
<table class="table table-sm table-bordered">
<tbody>
<tr class="table-secondary"><th colspan="2">{{ term.label }}<span class="badge badge-secondary float-right">Property</span></th></tr>
<tr><td class="label">Identifier</td><td><a href="{{ term.iri }}">{{ term.iri }}</a></td></tr>
<tr><td class="label">Definition</td><td>{{ term.definition }}</td></tr>
<tr><td class="label">Comments</td><td>{{ term.comments }}</td></tr>
<tr><td class="label">Examples</td><td>{{ term.examples }}</td></tr>
</tbody>
</table>
{% endfor %}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff