mirror of https://github.com/tdwg/dwc.git
Merge branch 'master' into restructure-repo
Conflicts: build/build.py
This commit is contained in:
commit
439462ea75
185
build/build.py
185
build/build.py
|
@ -5,22 +5,34 @@
|
|||
#
|
||||
|
||||
import io
|
||||
import re
|
||||
import csv
|
||||
import sys
|
||||
import codecs
|
||||
|
||||
from urllib import request
|
||||
|
||||
from Cheetah.Template import Template
|
||||
|
||||
# Maps each supported namespace URL (including its trailing slash) to the
# abbreviation used for that namespace by the builder.
NAMESPACES = {
    'http://rs.tdwg.org/dwc/iri/': 'dwciri',
    'http://rs.tdwg.org/dwc/terms/': 'dwc',
    'http://purl.org/dc/elements/1.1/': 'dc',
    'http://purl.org/dc/terms/': 'dcterms',
    'http://rs.tdwg.org/dwc/terms/attributes/': 'tdwgutility',
}
|
||||
|
||||
|
||||
class ProvidedTermsError(Exception):
    """Inconsistency in the available terms.

    Signals that the terms provided by the input files do not correspond
    to each other.
    """
|
||||
|
||||
|
||||
class RdfTypeError(Exception):
    """An rdf type was encountered that is not known by the builder."""
|
||||
|
||||
class DwcNamespaceError(Exception):
    """The namespace link is not available in the currently provided links."""
|
||||
|
||||
class DwcBuildReader():
|
||||
|
||||
|
@ -42,89 +54,146 @@ class DwcBuildReader():
|
|||
class DwcDigester(object):
|
||||
|
||||
def __init__(self, term_versions, terms_config):
    """Digest the normative Darwin Core document and the configurations
    file to support automatic generation of derivatives.

    Parameters
    -----------
    term_versions : str
        either a relative path and filename of the normative Dwc document
        or a URL link to the raw Github version of the file
    terms_config : str
        either a relative path and filename of the configurations file or
        a URL link to the raw Github version of the file

    Notes
    -----
    The sequence of the configurations file entries is essential for the
    automatic generation of the individual documents (mainly the
    index.html).

    Construction reads both files, checks that they describe the same
    terms (see `match_error_report`) and stores the result of
    `process_terms` as the `template_data` attribute.
    """
    self.term_versions = term_versions
    self.terms_config = terms_config

    # both lookup tables are keyed on the term_iri of each record
    self.term_versions_data = {}
    self._store_versions()
    self.terms_config_data = {}
    self._store_configs()

    # check for the ability to combine the data
    self.match_error_report()

    # create the defined data-object for the different outputs
    self.template_data = self.process_terms()
|
||||
|
||||
def versions(self):
    """Iterator providing the terms as represented in the normative term
    versions file.

    Only records whose ``status`` column equals ``"recommended"`` are
    yielded; every other record is skipped.

    Yields
    ------
    dict
        one record of the term versions file, as produced by
        `csv.DictReader`
    """
    # NOTE(review): the text wrapper uses the platform default encoding;
    # confirm whether the versions file should be decoded as UTF-8.
    with DwcBuildReader(self.term_versions) as versions:
        reader = csv.DictReader(io.TextIOWrapper(versions), delimiter=',')
        for vterm in reader:
            if vterm["status"] == "recommended":
                yield vterm
|
||||
|
||||
def configs(self):
    """Iterator providing the terms as represented in the terms config file
    (taking into account the sequence).

    Records are yielded unfiltered, in the order in which they appear in
    the file.

    Yields
    ------
    dict
        one record of the terms config file, as produced by
        `csv.DictReader`
    """
    # NOTE(review): the text wrapper uses the platform default encoding;
    # confirm whether the config file should be decoded as UTF-8.
    with DwcBuildReader(self.terms_config) as configs:
        reader = csv.DictReader(io.TextIOWrapper(configs), delimiter=',')
        for cfterm in reader:
            yield cfterm
|
||||
|
||||
def _store_versions(self):
    """Collect all the versions data in a dictionary as the
    term_versions_data attribute.

    Each record yielded by `versions` is stored under its ``term_iri``;
    a later record with the same ``term_iri`` replaces an earlier one.
    """
    for term in self.versions():
        self.term_versions_data[term["term_iri"]] = term
|
||||
|
||||
def _store_configs(self):
    """Collect all the config data in a dictionary as the
    terms_config_data attribute.

    Each record yielded by `configs` is stored under its ``term_iri``;
    a later record with the same ``term_iri`` replaces an earlier one.
    """
    for term in self.configs():
        self.terms_config_data[term["term_iri"]] = term
|
||||
|
||||
@property
def _version_terms(self):
    """Set of the term IRIs stored from the term_versions file."""
    # iterating a dict yields its keys, so .keys() is not needed
    return set(self.term_versions_data)
|
||||
|
||||
@property
def _config_terms(self):
    """Set of the term IRIs stored from the terms config file."""
    # iterating a dict yields its keys, so .keys() is not needed
    return set(self.terms_config_data)
|
||||
|
||||
def _select_versions_term(self, term_iri):
    """Select a specific term of the versions data, using term_iri match.

    Raises
    ------
    KeyError
        if no versions record is stored under `term_iri`
    """
    return self.term_versions_data[term_iri]
|
||||
|
||||
def _select_config_term(self, term_iri):
    """Select a specific term of the config data, using term_iri match.

    Raises
    ------
    KeyError
        if no config record is stored under `term_iri`
    """
    return self.terms_config_data[term_iri]
|
||||
|
||||
def match_error_report(self):
    """Check if the prime dwc file and the configurations file provide
    corresponding terms and inform the user on the term differences
    between both files.

    Raises
    ------
    ProvidedTermsError
        if at least one term is present in only one of the two files; the
        message lists the short names (last IRI segment) of those terms
    """
    # _version_terms and _config_terms are properties returning sets
    overload_versionterms = self._version_terms - self._config_terms
    overload_configterms = self._config_terms - self._version_terms
    if overload_versionterms or overload_configterms:
        # sorted() keeps the message stable: set iteration order is arbitrary
        vs_terms = ", ".join(sorted(term.split("/")[-1]
                                    for term in overload_versionterms))
        cf_terms = ", ".join(sorted(term.split("/")[-1]
                                    for term in overload_configterms))
        raise ProvidedTermsError("".join(["Terms only in term_versions.csv: ", vs_terms,
                                          ". Terms only in terms_config.csv: ", cf_terms]))
|
||||
@staticmethod
def split_iri(term_iri):
    """Split an iri field into the namespace url and the term itself.

    The split happens at the last ``/``; the namespace keeps its trailing
    slash.

    Parameters
    ----------
    term_iri : str
        full IRI of a term, e.g. ``http://rs.tdwg.org/dwc/terms/sex``

    Returns
    -------
    tuple of (str, str)
        the namespace url and the term name (empty when the IRI ends
        with ``/``)

    Raises
    ------
    ValueError
        if `term_iri` contains no ``/`` and hence cannot be split
    """
    namespace, slash, term = term_iri.rpartition("/")
    if not slash:
        raise ValueError("Cannot split term IRI without '/': {!r}".format(term_iri))
    return namespace + slash, term
|
||||
|
||||
def get_term_definition(self, term):
|
||||
"""Extract the required information to show on the webpage of a single term """
|
||||
cf_term = term
|
||||
vs_term = self._select_versions_term(term["term_iri"])
|
||||
term_iri = term['term_iri']
|
||||
@staticmethod
def resolve_namespace_abbrev(namespace):
    """Using the NAMESPACES constant, get the namespace abbreviation by
    providing the namespace link.

    Parameters
    ----------
    namespace : str
        namespace url, used as-is as the lookup key in NAMESPACES

    Raises
    ------
    DwcNamespaceError
        if `namespace` is not a key of NAMESPACES
    """
    try:
        return NAMESPACES[namespace]
    except KeyError:
        # "from None": the KeyError adds nothing to the domain error
        raise DwcNamespaceError(
            "The namespace url is currently not supported in NAMESPACES") from None
|
||||
|
||||
def get_term_definition(self, term_iri):
    """Extract the required information from both tables to show on the
    webpage of a single term by using the term_iri as the identifier.

    Parameters
    ----------
    term_iri : str
        IRI of the term, used as key in both the versions and config data

    Returns
    -------
    dict
        with the keys ``name``, ``iri``, ``label``, ``class``,
        ``definition``, ``comments``, ``rdf_type`` and ``namespace``

    Notes
    ------
    Due to the current implementation, make sure to provide the same keys
    as the ad-hoc record-level entry built in the `process_terms` method
    (room for improvement).
    """
    cf_term = self._select_config_term(term_iri)
    vs_term = self._select_versions_term(term_iri)

    # a single split provides both the namespace url and the term name
    namespace_url, name = self.split_iri(term_iri)

    term_data = {}
    term_data["name"] = name
    term_data["iri"] = term_iri
    term_data["label"] = vs_term['label']
    term_data["class"] = cf_term['organized_in']
    term_data["definition"] = vs_term['definition']
    term_data["comments"] = cf_term['comments']
    term_data["rdf_type"] = vs_term['rdf_type']
    term_data["namespace"] = self.resolve_namespace_abbrev(namespace_url)
    return term_data
|
||||
|
||||
def process_terms(self):
|
||||
"""parse the config terms towards the structure required for the HTML template"""
|
||||
"""parse the config terms (sequence matters!), collect all required data from both the normative versions file and the config file and return the template ready data.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Data object that can be digested by the html template file. Contains the term data formatted to create the individual outputs; each list element is a dictionary representing a class group. Hence, the data object is structured as follows:
|
||||
|
||||
[
|
||||
{'name' : class_group_name_1, 'label': xxxx,...,
|
||||
'terms':
|
||||
[
|
||||
{'name' : term_1, 'label': xxxx,...},
|
||||
{'name' : term_2, 'label': xxxx,...},
|
||||
...
|
||||
]}
|
||||
{'name' : class_group_name_2,...
|
||||
...},
|
||||
...
|
||||
]
|
||||
"""
|
||||
template_data = []
|
||||
in_class = "Record-level"
|
||||
# sequence matters in config and it starts with Record-level
|
||||
# sequence matters in config and it starts with Record-level which we populate here ad-hoc
|
||||
class_group = {}
|
||||
class_group["name"] = "Record-level"
|
||||
class_group["iri"] = None
|
||||
|
@ -134,8 +203,10 @@ class DwcDigester(object):
|
|||
class_group["comments"] = None
|
||||
class_group["rdf_type"] = None
|
||||
class_group["terms"] = []
|
||||
for term in self.config(): # sequence of the config file used as order
|
||||
term_data = self.get_term_definition(term)
|
||||
class_group["namespace"] = "Record-level"
|
||||
|
||||
for term in self.configs(): # sequence of the config file used as order
|
||||
term_data = self.get_term_definition(term['term_iri'])
|
||||
# new class encountered
|
||||
if term_data["rdf_type"] == "http://www.w3.org/2000/01/rdf-schema#Class":
|
||||
# store previous section in template_data
|
||||
|
@ -150,21 +221,64 @@ class DwcDigester(object):
|
|||
template_data.append(class_group)
|
||||
return template_data
|
||||
|
||||
def create_html(self, html_template="./config/index.tmpl",
                html_output="../www/guides/index.html"):
    """Build html with the processed term info, by filling in the
    tmpl-template.

    The `template_data` attribute is handed to the template under the
    name ``groups``.

    Parameters
    -----------
    html_template : str
        relative path and filename to the
        [Cheetah3](http://cheetahtemplate.org/) compatible template
    html_output : str
        relative path and filename to write the resulting index.html
    """
    data = {}
    data["groups"] = self.template_data
    html = Template(file=html_template, searchList=[data])

    # The context manager closes the file even when rendering or writing
    # fails. The encoding is set explicitly so the output does not depend
    # on the platform default.
    # NOTE(review): utf-8 is assumed here to match the other writers of
    # this class -- confirm the template declares the same charset.
    with open(html_output, "w", encoding="utf-8") as index_page:
        index_page.write(str(html))
|
||||
|
||||
def simple_dwc_terms(self):
    """Only extract those terms that are simple dwc, defined as `simple`
    in the flags column of the config file of terms.

    Returns
    -------
    list of str
        names of the terms whose rdf type is the RDF ``Property`` IRI and
        whose config ``flags`` value equals ``"simple"``, in the sequence
        of the config file
    """
    rdf_property = "http://www.w3.org/1999/02/22-rdf-syntax-ns#Property"
    properties = []
    for term in self.configs():
        term_data = self.get_term_definition(term['term_iri'])
        if term_data["rdf_type"] == rdf_property and term["flags"] == "simple":
            properties.append(term_data["name"])
    return properties
|
||||
|
||||
def create_dwc_list(self, file_output="../dist/simple_dwc_vertical.csv"):
    """Build a list of simple dwc terms and write it to file.

    Every term returned by `simple_dwc_terms` is written on its own line,
    UTF-8 encoded and terminated by ``\\n``.

    Parameters
    -----------
    file_output : str
        relative path and filename to write the resulting list
    """
    # collect the terms first, so a failure does not leave a truncated file
    terms = self.simple_dwc_terms()
    # newline="" writes "\n" untranslated on every platform, which is what
    # the binary-mode codecs.open() did
    with open(file_output, 'w', encoding='utf-8', newline='') as dwc_list_file:
        dwc_list_file.writelines(term + "\n" for term in terms)
|
||||
|
||||
def create_dwc_header(self, file_output="../dist/simple_dwc_horizontal.csv"):
    """Build a header of simple dwc terms and write it to file.

    The terms returned by `simple_dwc_terms` are written as a single
    comma-separated line, UTF-8 encoded and terminated by ``\\n``.

    Parameters
    -----------
    file_output : str
        relative path and filename to write the resulting list
    """
    # collect the terms first, so a failure does not leave a truncated file
    properties = self.simple_dwc_terms()
    # newline="" writes "\n" untranslated on every platform, which is what
    # the binary-mode codecs.open() did
    with open(file_output, 'w', encoding='utf-8', newline='') as dwc_header_file:
        dwc_header_file.write(",".join(properties))
        dwc_header_file.write("\n")
|
||||
|
||||
def main():
|
||||
"""Building up the html"""
|
||||
"""Building up the quick reference html and derivatives"""
|
||||
|
||||
config_terms_file = "./config/terms.csv"
|
||||
term_versions_file = "../standard/vocabularies/term_versions.csv"
|
||||
|
@ -172,7 +286,10 @@ def main():
|
|||
print("Running build process using current term_versions and config_terms file...")
|
||||
my_dwc = DwcDigester(term_versions_file, config_terms_file)
|
||||
print("Building index html file...")
|
||||
my_dwc.create_html(my_dwc.process_terms())
|
||||
my_dwc.create_html()
|
||||
print("Building simple dwc list and header...")
|
||||
my_dwc.create_dwc_list()
|
||||
my_dwc.create_dwc_header()
|
||||
print("...done!")
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
type,modified,language,license,rightsHolder,accessRights,bibliographicCitation,references,institutionID,collectionID,datasetID,institutionCode,collectionCode,datasetName,ownerInstitutionCode,basisOfRecord,informationWithheld,dataGeneralizations,dynamicProperties,occurrenceID,catalogNumber,recordNumber,recordedBy,individualCount,organismQuantity,organismQuantityType,sex,lifeStage,reproductiveCondition,behavior,establishmentMeans,occurrenceStatus,preparations,disposition,associatedMedia,associatedReferences,associatedSequences,associatedTaxa,otherCatalogNumbers,occurrenceRemarks,organismID,organismName,organismScope,associatedOccurrences,associatedOrganisms,previousIdentifications,organismRemarks,materialSampleID,eventID,parentEventID,fieldNumber,eventDate,eventTime,startDayOfYear,endDayOfYear,year,month,day,verbatimEventDate,habitat,samplingProtocol,sampleSizeValue,sampleSizeUnit,samplingEffort,fieldNotes,eventRemarks,locationID,higherGeographyID,higherGeography,continent,waterBody,islandGroup,island,country,countryCode,stateProvince,county,municipality,locality,verbatimLocality,minimumElevationInMeters,maximumElevationInMeters,verbatimElevation,minimumDepthInMeters,maximumDepthInMeters,verbatimDepth,minimumDistanceAboveSurfaceInMeters,maximumDistanceAboveSurfaceInMeters,locationAccordingTo,locationRemarks,decimalLatitude,decimalLongitude,geodeticDatum,coordinateUncertaintyInMeters,coordinatePrecision,pointRadiusSpatialFit,verbatimCoordinates,verbatimLatitude,verbatimLongitude,verbatimCoordinateSystem,verbatimSRS,footprintWKT,footprintSRS,footprintSpatialFit,georeferencedBy,georeferencedDate,georeferenceProtocol,georeferenceSources,georeferenceVerificationStatus,georeferenceRemarks,geologicalContextID,earliestEonOrLowestEonothem,latestEonOrHighestEonothem,earliestEraOrLowestErathem,latestEraOrHighestErathem,earliestPeriodOrLowestSystem,latestPeriodOrHighestSystem,earliestEpochOrLowestSeries,latestEpochOrHighestSeries,earliestAgeOrLowestStage,latestAgeOrHighestStage,l
owestBiostratigraphicZone,highestBiostratigraphicZone,lithostratigraphicTerms,group,formation,member,bed,identificationID,identificationQualifier,typeStatus,identifiedBy,dateIdentified,identificationReferences,identificationVerificationStatus,identificationRemarks,taxonID,scientificNameID,acceptedNameUsageID,parentNameUsageID,originalNameUsageID,nameAccordingToID,namePublishedInID,taxonConceptID,scientificName,acceptedNameUsage,parentNameUsage,originalNameUsage,nameAccordingTo,namePublishedIn,namePublishedInYear,higherClassification,kingdom,phylum,class,order,family,genus,subgenus,specificEpithet,infraspecificEpithet,taxonRank,verbatimTaxonRank,scientificNameAuthorship,vernacularName,nomenclaturalCode,taxonomicStatus,nomenclaturalStatus,taxonRemarks
|
|
|
@ -22,6 +22,8 @@ catalogNumber
|
|||
recordNumber
|
||||
recordedBy
|
||||
individualCount
|
||||
organismQuantity
|
||||
organismQuantityType
|
||||
sex
|
||||
lifeStage
|
||||
reproductiveCondition
|
||||
|
@ -45,6 +47,7 @@ previousIdentifications
|
|||
organismRemarks
|
||||
materialSampleID
|
||||
eventID
|
||||
parentEventID
|
||||
fieldNumber
|
||||
eventDate
|
||||
eventTime
|
||||
|
@ -56,6 +59,8 @@ day
|
|||
verbatimEventDate
|
||||
habitat
|
||||
samplingProtocol
|
||||
sampleSizeValue
|
||||
sampleSizeUnit
|
||||
samplingEffort
|
||||
fieldNotes
|
||||
eventRemarks
|
||||
|
@ -162,19 +167,3 @@ nomenclaturalCode
|
|||
taxonomicStatus
|
||||
nomenclaturalStatus
|
||||
taxonRemarks
|
||||
resourceRelationshipID
|
||||
resourceID
|
||||
relatedResourceID
|
||||
relationshipOfResource
|
||||
relationshipAccordingTo
|
||||
relationshipEstablishedDate
|
||||
relationshipRemarks
|
||||
measurementID
|
||||
measurementType
|
||||
measurementValue
|
||||
measurementAccuracy
|
||||
measurementUnit
|
||||
measurementDeterminedBy
|
||||
measurementDeterminedDate
|
||||
measurementMethod
|
||||
measurementRemarks
|
|
|
@ -1 +0,0 @@
|
|||
"type","modified","language","license","rightsHolder","accessRights","bibliographicCitation","references","institutionID","collectionID","datasetID","institutionCode","collectionCode","datasetName","ownerInstitutionCode","basisOfRecord","informationWithheld","dataGeneralizations","dynamicProperties","occurrenceID","catalogNumber","recordNumber","recordedBy","individualCount","sex","lifeStage","reproductiveCondition","behavior","establishmentMeans","occurrenceStatus","preparations","disposition","associatedMedia","associatedReferences","associatedSequences","associatedTaxa","otherCatalogNumbers","occurrenceRemarks","organismID","organismName","organismScope","associatedOccurrences","associatedOrganisms","previousIdentifications","organismRemarks","materialSampleID","eventID","fieldNumber","eventDate","eventTime","startDayOfYear","endDayOfYear","year","month","day","verbatimEventDate","habitat","samplingProtocol","samplingEffort","fieldNotes","eventRemarks","locationID","higherGeographyID","higherGeography","continent","waterBody","islandGroup","island","country","countryCode","stateProvince","county","municipality","locality","verbatimLocality","minimumElevationInMeters","maximumElevationInMeters","verbatimElevation","minimumDepthInMeters","maximumDepthInMeters","verbatimDepth","minimumDistanceAboveSurfaceInMeters","maximumDistanceAboveSurfaceInMeters","locationAccordingTo","locationRemarks","decimalLatitude","decimalLongitude","geodeticDatum","coordinateUncertaintyInMeters","coordinatePrecision","pointRadiusSpatialFit","verbatimCoordinates","verbatimLatitude","verbatimLongitude","verbatimCoordinateSystem","verbatimSRS","footprintWKT","footprintSRS","footprintSpatialFit","georeferencedBy","georeferencedDate","georeferenceProtocol","georeferenceSources","georeferenceVerificationStatus","georeferenceRemarks","geologicalContextID","earliestEonOrLowestEonothem","latestEonOrHighestEonothem","earliestEraOrLowestErathem","latestEraOrHighestErathem","earliestPeriodOrLowestSy
stem","latestPeriodOrHighestSystem","earliestEpochOrLowestSeries","latestEpochOrHighestSeries","earliestAgeOrLowestStage","latestAgeOrHighestStage","lowestBiostratigraphicZone","highestBiostratigraphicZone","lithostratigraphicTerms","group","formation","member","bed","identificationID","identificationQualifier","typeStatus","identifiedBy","dateIdentified","identificationReferences","identificationVerificationStatus","identificationRemarks","taxonID","scientificNameID","acceptedNameUsageID","parentNameUsageID","originalNameUsageID","nameAccordingToID","namePublishedInID","taxonConceptID","scientificName","acceptedNameUsage","parentNameUsage","originalNameUsage","nameAccordingTo","namePublishedIn","namePublishedInYear","higherClassification","kingdom","phylum","class","order","family","genus","subgenus","specificEpithet","infraspecificEpithet","taxonRank","verbatimTaxonRank","scientificNameAuthorship","vernacularName","nomenclaturalCode","taxonomicStatus","nomenclaturalStatus","taxonRemarks","resourceRelationshipID","resourceID","relatedResourceID","relationshipOfResource","relationshipAccordingTo","relationshipEstablishedDate","relationshipRemarks","measurementID","measurementType","measurementValue","measurementAccuracy","measurementUnit","measurementDeterminedBy","measurementDeterminedDate","measurementMethod","measurementRemarks"
|
|
Loading…
Reference in New Issue