dwc/build/update_terms.py

#!/usr/bin/env python

'''
Install the required libraries for this script with PIP:
$ easy_install pip
$ sudo pip install -r requirements.txt

To run the script, execute it from inside the build folder (all file paths it uses are relative to that folder):
$ ./update_terms.py
'''

import yaml, re, csv, codecs
from Cheetah.Template import Template
from rdflib import Graph, URIRef, Namespace, Literal
from rdflib.namespace import RDFS

# RDF namespaces used to build term URIs and predicates below.
# Terms under http://rs.tdwg.org/dwc/terms/ (rendered with the "dwc:" prefix).
DWC=Namespace("http://rs.tdwg.org/dwc/terms/")
# Terms under http://purl.org/dc/terms/ (rendered with the "dcterms:" prefix).
DC=Namespace("http://purl.org/dc/terms/")
# Attribute predicates such as organizedInClass.
DWCA=Namespace("http://rs.tdwg.org/dwc/terms/attributes/")
# URI that the "Record-level" name resolves to; getTermDef() treats it as
# having no URI, so no graph lookup is made for it.
REC_LEVEL=DWC.term("Record-level")


def buildHtml(groups):
    """Render the term groups into ``../terms/index.html``.

    The Cheetah template ``terms.tmpl`` is filled with ``{"groups": groups}``
    and the result overwrites the target file.

    groups -- list of group dicts as returned by parseTerms().
    """
    print("""building html files""")
    html = Template(file="terms.tmpl", searchList=[{"groups": groups}])
    # Render before opening the target so that a template error does not
    # leave a truncated index.html behind.
    rendered = str(html)
    # ``with`` guarantees the file is closed even if the write fails.
    with open("../terms/index.html", "w") as recommended:
        recommended.write(rendered)

def _iterTerms(groups):
    """Yield every term dict of every group, in group order."""
    for g in groups:
        for t in g["terms"]:
            yield t


def buildDownloads(groups):
    """Write the downloadable term listings into ``../resources/``.

    Files produced (existing files are overwritten):
      * dwc_terms.csv                - name, URI, label, class, definition
                                       and comment of every term
      * simple_dwc_terms_list.csv    - one term name per line
      * simple_dwc.properties        - ``<term>.<field>=<value>`` lines
      * simple_dwc_terms_header.csv  - a single row of quoted term names
      * simple_dwc_pgsql.sql         - a ``CREATE TABLE dwc`` statement whose
                                       column types come from term_type.yaml
                                       (columns not listed there are "text")

    groups -- list of group dicts as returned by parseTerms(); only the
              entries under each group's "terms" key are exported.
    """
    terms = list(_iterTerms(groups))

    print("""building dwc_terms.csv""")
    with open('../resources/dwc_terms.csv', 'w') as csvf:
        writer = csv.writer(csvf, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerow(['TermName', 'URI', "Label_en", "Group", "Definition", "Comments"])
        for t in terms:
            writer.writerow([t["name"], t["uri"], utf8(t["label"]), t["class"],
                             utf8(t["definition"]), utf8(t["comment"])])

    print("""building simple_dwc_terms_list.csv""")
    with codecs.open('../resources/simple_dwc_terms_list.csv', 'w', 'utf-8') as f:
        for t in terms:
            f.write(t["name"] + "\n")

    print("""building simple_dwc.properties""")
    with codecs.open('../resources/simple_dwc.properties', 'w', 'utf-8') as f:
        for t in terms:
            term = t["name"]
            f.write("%s.name=%s\n" % (term, term))
            f.write("%s.uri=%s\n" % (term, t["uri"]))
            # n2e() keeps missing values from being written as "None".
            f.write("%s.label=%s\n" % (term, n2e(t["label"])))
            f.write("%s.definition=%s\n" % (term, n2e(t["definition"])))
            f.write("%s.comment=%s\n" % (term, n2e(t["comment"])))

    print("""building simple_dwc_terms_header.csv""")
    with codecs.open('../resources/simple_dwc_terms_header.csv', 'w', 'utf-8') as f:
        f.write(",".join('"' + t["name"] + '"' for t in terms))
        f.write("\n")

    print("""building simple_dwc_pgsql.sql""")
    with open('term_type.yaml', 'r') as f:
        # safe_load: plain yaml.load() without a Loader is rejected by
        # PyYAML >= 6. An empty file parses to None, hence the fallback.
        types = yaml.safe_load(f) or {}
    with codecs.open('../resources/simple_dwc_pgsql.sql', 'w', 'utf-8') as f:
        columns = ['  "%s" %s' % (t["name"], types.get(t["name"], "text")) for t in terms]
        f.write("CREATE TABLE dwc (\n")
        f.write(",\n".join(columns))
        f.write("\n);\n")

def n2e(x):
    """Return the empty string for None; return any other value unchanged."""
    return "" if x is None else x

def utf8(x):
    """Return *x* encoded as UTF-8, or None when *x* is None."""
    return None if x is None else x.encode("utf-8")

def anchorLinks(x):
    """Wrap every http(s) URL found in *x* in an HTML ``<a>`` element.

    A URL runs from ``http://`` or ``https://`` up to the next whitespace,
    but must end in a letter, digit, ``/``, ``_`` or ``-``; trailing
    punctuation such as a sentence-ending period is therefore left outside
    the link.

    x -- text to process, or None.
    Returns the text with links inserted, or None when *x* is None.
    """
    if x is None:
        return x
    # Raw strings: "\S" and "\1" must reach the regex engine verbatim, and
    # "\S" in a normal string literal is an invalid escape sequence.
    return re.sub(r'(https?://\S+[a-zA-Z0-9/_-])', r"<a href='\1'>\1</a>", x)

def getTermDef(name, g):
    """Build the description dict for one term (or group) name.

    Names starting with "DC_" are looked up in the DC namespace with the
    prefix stripped; all other names are looked up in the DWC namespace.
    The name that resolves to REC_LEVEL gets ``uri`` set to None and no
    further keys.

    name -- term name as listed in the order file.
    g    -- graph queried for the term's properties.
    Returns a dict with "name", "name_prefixed" and "uri", plus "label",
    "class", "definition", "comment" and "version" when a URI exists.
    Raises AssertionError when the graph holds no definition for the URI.
    """
    if name.startswith("DC_"):
        name = name[3:]
        prefixed = "dcterms:" + name
        uri = DC.term(name)
    else:
        prefixed = "dwc:" + name
        uri = DWC.term(name)
        if uri == REC_LEVEL:
            uri = None

    t = {"name": name, "name_prefixed": prefixed, "uri": uri}
    if uri is None:
        return t

    t["label"] = g.value(subject=uri, predicate=RDFS.label)
    t["class"] = g.value(subject=uri, predicate=DWCA.organizedInClass)
    # URLs in the free-text fields are turned into HTML links.
    t["definition"] = anchorLinks(g.value(subject=uri, predicate=RDFS.comment))
    t["comment"] = anchorLinks(g.value(subject=uri, predicate=DC.description))
    t["version"] = g.value(subject=uri, predicate=DC.hasVersion)
    if t["definition"] is None:
        raise AssertionError("Unknown term definition "+str(uri))
    return t

def parseTerms():
    """Load all term definitions and return them grouped and ordered.

    Reads ``../terms/dwc_normative.rdf`` into a graph, adds the entries of
    ``dc.yaml`` (a mapping of term name to a dict with "definition",
    "comment" and "details") to it, and then walks ``term_order.yaml``
    (a list of one-entry mappings ``group name -> list of term names``)
    to build the result.

    Returns a list of group dicts as produced by getTermDef(), each with an
    extra "terms" key holding the list of its term dicts.
    Raises AssertionError when the DWC namespace subject is still in the
    graph after removal, when a listed term has no definition, or when the
    graph holds a term that the order file does not list.
    """
    g = Graph()
    g.parse("../terms/dwc_normative.rdf")
    # we remove the abstract dwc term accordingTo
    g.remove((DWC.accordingTo,None,None))
    g.remove((DWC+"",None,None))
    if (DWC+"", None, None) in g:
        raise AssertionError("DWC NS in here")

    # safe_load: plain yaml.load() without a Loader is rejected by PyYAML >= 6.
    with open('dc.yaml', 'r') as dcf:
        dc = yaml.safe_load(dcf)
    for name, entry in dc.items():
        uri = DC[name]
        g.add( (uri, RDFS.comment, Literal(entry["definition"])) )
        g.add( (uri, DC.description, Literal(entry["comment"])) )
        g.add( (uri, DC.hasVersion, URIRef(entry["details"])) )

    with open('term_order.yaml', 'r') as f:
        terms = yaml.safe_load(f)

    groups = []
    for group in terms:
        # Take the term list that belongs to the chosen group name instead of
        # sorting keys and values independently, which could pair a name with
        # another entry's terms if a mapping ever had more than one key.
        groupName = sorted(group.keys())[0]
        groupTerm = getTermDef(groupName, g)
        # A group without terms parses to None; treat it as empty.
        groupTerm["terms"] = [getTermDef(t, g) for t in (group[groupName] or [])]
        groups.append(groupTerm)

    # finally verify we have all terms covered in both the order yaml and the graph
    verifyCompleteness(g, groups)
    return groups

def verifyCompleteness(graph, groups):
    """Check that every subject in *graph* is listed in *groups*.

    Collects the URIs of all groups and of all their terms, prints how many
    distinct URIs were found, and then compares every graph subject against
    that collection. DWC.accordingTo and the bare DWC namespace URI are
    exempt from the check.

    graph  -- object whose subjects() yields the subjects to check.
    groups -- list of group dicts, each with "uri" and "terms" keys.
    Raises AssertionError naming the first subject that is not listed.
    """
    known = set()
    for g in groups:
        # The group dict always carries a "uri" key, but it is None for the
        # group without a URI; skip it rather than recording the text "None".
        if g.get("uri") is not None:
            known.add(str(g["uri"]))
        known.update(str(t["uri"]) for t in g["terms"])
    print("""%s terms defined""" % len(known))
    for s in graph.subjects():
        if s not in (DWC.accordingTo, DWC.term("")) and str(s) not in known:
            raise AssertionError("Term missing from term_order.yaml: %s" % s)
    print("""All terms exist in both the graph and yaml""")


if __name__ == "__main__":
    # Parse and verify the terms once, then generate the HTML page and the
    # download files from the same data.
    term_groups = parseTerms()
    buildHtml(term_groups)
    buildDownloads(term_groups)