mirror of https://github.com/tdwg/dwc.git
169 lines
5.8 KiB
Python
Executable File
169 lines
5.8 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
'''
|
|
Install the required libraries for this script with PIP:
|
|
$ easy_install pip
|
|
$ sudo pip install -r requirements.txt
|
|
|
|
To run the script, execute it from inside the build folder:
|
|
$ ./update_terms.py
|
|
'''
|
|
|
|
import yaml, re, csv, codecs
|
|
from Cheetah.Template import Template
|
|
from rdflib import Graph, URIRef, Namespace, Literal
|
|
from rdflib.namespace import RDFS
|
|
|
|
# RDF namespaces used when looking up and adding term metadata in the graph.
DWC = Namespace("http://rs.tdwg.org/dwc/terms/")
DC = Namespace("http://purl.org/dc/terms/")
DWCA = Namespace("http://rs.tdwg.org/dwc/terms/attributes/")

# URI produced for the "Record-level" name; getTermDef() maps it to uri=None.
REC_LEVEL = DWC.term("Record-level")
|
|
|
|
|
|
def buildHtml(groups):
    """Render ``terms.tmpl`` and write the result to ``../terms/index.html``.

    groups -- the list of group dicts produced by parseTerms(); it is handed
              to the template under the search-list key ``groups``.

    Paths are relative, so the script has to be run from the build folder.
    """
    print("building html files")
    html = Template(file="terms.tmpl", searchList=[{"groups": groups}])
    # Render before opening the output so a template failure does not leave a
    # truncated index.html behind.
    rendered = str(html)
    # The context manager closes the handle even if the write raises.
    with open("../terms/index.html", "w") as recommended:
        recommended.write(rendered)
|
|
|
|
def _iterTerms(groups):
    """Yield every term dict of every group, preserving group and term order."""
    for g in groups:
        for t in g["terms"]:
            yield t


def _writeTermsCsv(groups):
    """Write one fully quoted CSV row per term to dwc_terms.csv."""
    with open('../resources/dwc_terms.csv', 'w') as csvf:
        writer = csv.writer(csvf, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerow(['TermName', 'URI', "Label_en", "Group", "Definition", "Comments"])
        for t in _iterTerms(groups):
            writer.writerow([t["name"], t["uri"], utf8(t["label"]), t["class"],
                             utf8(t["definition"]), utf8(t["comment"])])


def _writeTermsList(groups):
    """Write the bare term names, one per line, to simple_dwc_terms_list.csv."""
    with codecs.open('../resources/simple_dwc_terms_list.csv', 'w', 'utf-8') as f:
        for t in _iterTerms(groups):
            f.write(t["name"] + "\n")


def _writeProperties(groups):
    """Write name/uri/label/definition/comment entries to simple_dwc.properties."""
    with codecs.open('../resources/simple_dwc.properties', 'w', 'utf-8') as f:
        for t in _iterTerms(groups):
            term = t["name"]
            f.write("%s.name=%s\n" % (term, term))
            f.write("%s.uri=%s\n" % (term, t["uri"]))
            # n2e() keeps missing values from being written as "None".
            f.write("%s.label=%s\n" % (term, n2e(t["label"])))
            f.write("%s.definition=%s\n" % (term, n2e(t["definition"])))
            f.write("%s.comment=%s\n" % (term, n2e(t["comment"])))


def _writeTermsHeader(groups):
    """Write a single quoted, comma separated header line to simple_dwc_terms_header.csv."""
    with codecs.open('../resources/simple_dwc_terms_header.csv', 'w', 'utf-8') as f:
        f.write(",".join('"' + t["name"] + '"' for t in _iterTerms(groups)))
        f.write("\n")


def _writePgsql(groups):
    """Write a CREATE TABLE statement with one column per term to simple_dwc_pgsql.sql."""
    with open('term_type.yaml', 'r') as f:
        # safe_load: plain data only, and yaml.load() without an explicit
        # Loader is rejected by current PyYAML. An empty file yields None.
        types = yaml.safe_load(f) or {}
    with codecs.open('../resources/simple_dwc_pgsql.sql', 'w', 'utf-8') as f:
        f.write("CREATE TABLE dwc (\n")
        # Terms without an entry in term_type.yaml become "text" columns.
        f.write(",\n".join(' "%s" ' % t["name"] + types.get(t["name"], "text")
                           for t in _iterTerms(groups)))
        f.write("\n);\n")


def buildDownloads(groups):
    """Generate the downloadable term files under ``../resources``.

    groups -- list of group dicts, each with a ``terms`` list of term dicts
              providing ``name``, ``uri``, ``label``, ``class``,
              ``definition`` and ``comment``.

    Files written, in this order: dwc_terms.csv, simple_dwc_terms_list.csv,
    simple_dwc.properties, simple_dwc_terms_header.csv, simple_dwc_pgsql.sql.
    Column types for the SQL file are read from ``term_type.yaml``. All paths
    are relative, so the script has to be run from the build folder.
    """
    print("building dwc_terms.csv")
    _writeTermsCsv(groups)
    print("building simple_dwc_terms_list.csv")
    _writeTermsList(groups)
    print("building simple_dwc.properties")
    _writeProperties(groups)
    print("building simple_dwc_terms_header.csv")
    _writeTermsHeader(groups)
    print("building simple_dwc_pgsql.sql")
    _writePgsql(groups)
|
|
|
|
def n2e(x):
    """Turn None into the empty string; hand back any other value unchanged."""
    return "" if x is None else x
|
|
|
|
def utf8(x):
    """Return ``x.encode("utf-8")``, letting None pass through untouched."""
    return None if x is None else x.encode("utf-8")
|
|
|
|
def anchorLinks(x):
    """Wrap each http(s) URL in *x* in an ``<a href='...'>`` element.

    x -- text to scan, or None.

    Returns None when x is None, otherwise the text with every URL replaced
    by an anchor whose href and link text are both the URL. A URL must end in
    a letter, digit, ``/``, ``_`` or ``-``, so trailing punctuation such as a
    sentence-ending full stop stays outside the link.
    """
    if x is None:
        return x
    # Raw strings: "\S" in a normal literal is an invalid escape sequence that
    # newer Python versions warn about.
    return re.sub(r'(https?://\S+[a-zA-Z0-9/_-])', r"<a href='\1'>\1</a>", x)
|
|
|
|
def getTermDef(name, g):
    """Build the description dict for a single term or group name.

    name -- term name; a leading ``DC_`` selects the DC namespace and is
            stripped from the stored name, anything else resolves in DWC.
    g    -- graph queried via ``g.value(subject=..., predicate=...)``.

    The result always carries ``name``, ``name_prefixed`` and ``uri``. When
    the uri is not None it additionally carries ``label``, ``class``,
    ``definition``, ``comment`` and ``version``.

    Raises AssertionError if the graph has no rdfs:comment for the uri.
    """
    if name.startswith("DC_"):
        name = name[3:]
        prefix = "dcterms:"
        uri = DC.term(name)
    else:
        prefix = "dwc:"
        uri = DWC.term(name)

    # The Record-level entry is reported without a URI.
    if uri == REC_LEVEL:
        uri = None

    t = {"name": name, "name_prefixed": prefix + name, "uri": uri}
    if uri is None:
        return t

    def lookup(predicate):
        return g.value(subject=uri, predicate=predicate)

    t["label"] = lookup(RDFS.label)
    t["class"] = lookup(DWCA.organizedInClass)
    # Definition and comment get their URLs turned into HTML anchors.
    t["definition"] = anchorLinks(lookup(RDFS.comment))
    t["comment"] = anchorLinks(lookup(DC.description))
    t["version"] = lookup(DC.hasVersion)

    if t["definition"] is None:
        raise AssertionError("Unknown term definition "+str(uri))
    return t
|
|
|
|
def parseTerms():
    """Load the normative RDF plus the yaml side files and return the term groups.

    Reads ``../terms/dwc_normative.rdf`` into a graph, adds the definition,
    comment and version of every entry in ``dc.yaml`` to that graph, then
    walks ``term_order.yaml`` to build the ordered groups.

    Returns a list of group dicts as built by getTermDef(), each extended
    with a ``terms`` list holding the dicts of its member terms.

    Raises AssertionError when a listed term has no definition in the graph
    (from getTermDef) or when verifyCompleteness() finds a graph subject that
    the yaml does not list.
    """
    g = Graph()
    g.parse("../terms/dwc_normative.rdf")
    # we remove the abstract dwc term accordingTo
    g.remove((DWC.accordingTo, None, None))
    g.remove((DWC+"", None, None))
    if (DWC+"", None, None) in g:
        raise AssertionError("DWC NS in here")

    with open('dc.yaml', 'r') as dcf:
        # safe_load: these files are plain data, and yaml.load() without an
        # explicit Loader is rejected by current PyYAML.
        dc = yaml.safe_load(dcf) or {}
    for name, info in dc.items():
        uri = DC[name]
        g.add((uri, RDFS.comment, Literal(info["definition"])))
        g.add((uri, DC.description, Literal(info["comment"])))
        g.add((uri, DC.hasVersion, URIRef(info["details"])))

    with open('term_order.yaml', 'r') as f:
        terms = yaml.safe_load(f)

    groups = []
    for group in terms:
        # Each entry maps a group name to its member list (or to nothing).
        # Look the members up by key so name and members always belong
        # together instead of sorting keys and values independently.
        groupName = sorted(group)[0]
        groupTerm = getTermDef(groupName, g)
        groupTerm["terms"] = [getTermDef(t, g) for t in group[groupName] or []]
        groups.append(groupTerm)

    # finally verify we have all terms covered in both the order yaml and the graph
    verifyCompleteness(g, groups)
    return groups
|
|
|
|
def verifyCompleteness(graph, groups):
    """Check that every subject of *graph* is listed in *groups*.

    graph  -- object whose ``subjects()`` yields the subjects to check.
    groups -- list of group dicts with a ``uri`` entry and a ``terms`` list
              of term dicts that each have a ``uri``.

    Prints the number of distinct URIs collected from *groups* and a
    confirmation line on success.

    Raises AssertionError for the first graph subject that is neither listed
    in *groups* nor one of the two ignored DWC subjects.
    """
    terms = set()
    for g in groups:
        # A group without a URI (uri is None) is not a term; skipping it keeps
        # the literal string "None" out of the set and out of the count.
        if g.get("uri") is not None:
            terms.add(str(g["uri"]))
        for t in g["terms"]:
            terms.add(str(t["uri"]))
    print("%s terms defined" % len(terms))

    for s in graph.subjects():
        if s not in (DWC.accordingTo, DWC.term("")) and str(s) not in terms:
            raise AssertionError("Term missing from term_order.yaml: " + s)
    print("All terms exist in both the graph and yaml")
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Parse once, then feed the same groups to both generators.
    termGroups = parseTerms()
    buildHtml(termGroups)
    buildDownloads(termGroups)
|