dwc/build/update_terms.py

#!/usr/bin/env python

''' 
easy_install pip
pip install -r requirements.txt
'''

import yaml, re, csv, codecs
from Cheetah.Template import Template
from rdflib import Graph, URIRef, Namespace, Literal
from rdflib.namespace import RDFS

# Namespaces used to build and look up term URIs in the graph.
DWC = Namespace("http://rs.tdwg.org/dwc/terms/")
DC = Namespace("http://purl.org/dc/terms/")
DWCA = Namespace("http://rs.tdwg.org/dwc/terms/attributes/")
# URI that the name "Record-level" resolves to; getTermDef() replaces it
# with uri=None instead of looking it up in the graph.
REC_LEVEL = DWC.term("Record-level")


def buildHtml(groups):
    """Render the term groups into ``../terms/index.html``.

    The page comes from the Cheetah template ``terms.tmpl``, which receives
    the groups through its search list under the key ``groups``. Both paths
    are relative to the current working directory.

    Args:
        groups: list of group dicts as returned by parseTerms().
    """
    print("""building html files""")
    html = Template(file="terms.tmpl", searchList=[{"groups": groups}])
    # Render before opening the target: opening with "w" truncates the file,
    # so a template error must not leave an empty index.html behind.
    rendered = str(html)
    with open("../terms/index.html", "w") as recommended:
        recommended.write(rendered)

def buildDownloads(groups):
    """Write the downloadable resource files derived from the term groups.

    Files written (paths relative to the current working directory):

    * ``../resources/dwc_terms.csv`` - one fully quoted row per term with
      name, URI, label, class, definition and comment.
    * ``../resources/simple_dwc_terms_list.csv`` - one simple name per line.
    * ``../resources/simple_dwc.properties`` - ``<term>.<field>=<value>``
      lines for name, uri, label, definition and comment.
    * ``../resources/simple_dwc_terms_header.csv`` - a single line holding
      all simple names, quoted and comma separated.
    * ``../resources/simple_dwc_pgsql.sql`` - a ``CREATE TABLE dwc``
      statement with one column per term; the column type is read from
      ``term_type.yaml`` and defaults to ``text``.

    Only the member terms of each group are exported, in group order; the
    group entries themselves are not.

    Args:
        groups: list of group dicts as returned by parseTerms().
    """
    # Every output iterates the same flattened sequence of member terms.
    terms = [t for g in groups for t in g["terms"]]
    simple_names = [t["name_simple"] for t in terms]

    print("""building dwc_terms.csv""")
    with open('../resources/dwc_terms.csv', 'w') as csvf:
        writer = csv.writer(csvf, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerow(['TermName', 'URI', "Label_en", "Group", "Definition", "Comments"])
        for t in terms:
            writer.writerow([t["name"], t["uri"], utf8(t["label"]), t["class"], utf8(t["definition"]), utf8(t["comment"])])

    print("""building simple_dwc_terms_list.csv""")
    with codecs.open('../resources/simple_dwc_terms_list.csv', 'w', 'utf-8') as f:
        for name in simple_names:
            f.write(name + "\n")

    print("""building simple_dwc.properties""")
    with codecs.open('../resources/simple_dwc.properties', 'w', 'utf-8') as f:
        for t in terms:
            term = t["name_simple"]
            f.write("%s.name=%s\n" % (term, term))
            f.write("%s.uri=%s\n" % (term, t["uri"]))
            # n2e keeps missing values from showing up as the text "None".
            f.write("%s.label=%s\n" % (term, n2e(t["label"])))
            f.write("%s.definition=%s\n" % (term, n2e(t["definition"])))
            f.write("%s.comment=%s\n" % (term, n2e(t["comment"])))

    print("""building simple_dwc_terms_header.csv""")
    with codecs.open('../resources/simple_dwc_terms_header.csv', 'w', 'utf-8') as f:
        f.write(",".join('"' + name + '"' for name in simple_names))
        f.write("\n")

    print("""building simple_dwc_pgsql.sql""")
    with open('term_type.yaml', 'r') as f:
        # safe_load only builds plain data and, unlike a bare yaml.load(f),
        # is accepted by every PyYAML release. An empty file loads as None,
        # which is treated as "no type overrides".
        types = yaml.safe_load(f) or {}
    with codecs.open('../resources/simple_dwc_pgsql.sql', 'w', 'utf-8') as f:
        columns = [('  "%s" ' % name) + types.get(name, "text") for name in simple_names]
        f.write("CREATE TABLE dwc (\n")
        f.write(",\n".join(columns))
        f.write("\n);\n")

def n2e(x):
    """Return an empty string for None ("none to empty"), otherwise x itself."""
    return "" if x is None else x

def utf8(x):
    """Encode x as UTF-8, passing None through unchanged."""
    return None if x is None else x.encode("utf-8")

def anchorLinks(x):
    """Wrap every http/https URL found in x in an HTML ``<a>`` element.

    Sentence punctuation (``. , ; : ! ?``) that directly follows a URL is
    kept outside the link, so "see http://example.org/page." links to
    ``http://example.org/page`` rather than to a URL ending in a full stop.
    No other clean-up is attempted; a closing bracket right after a URL
    still becomes part of the link.

    Args:
        x: text to process, or None.

    Returns:
        The text with URLs turned into anchors, or None when x is None.
    """
    if x is None:
        return x
    # Raw string so that \S reaches the regex engine as written. The final
    # character class forces the match to end on a non-punctuation character.
    return re.sub(r"(https?://\S*[^\s.,;:!?])", "<a href='\\1'>\\1</a>", x)
    
def getTermDef(name, g):
    """Build the dictionary describing one term.

    A name starting with ``DC_`` is resolved in the DC namespace after the
    prefix is stripped, and is published as ``dcterms:<name>``. Any other
    name is resolved in the DWC namespace. The name that resolves to
    REC_LEVEL gets ``uri=None`` and no graph-derived entries.

    Args:
        name: term name as it is passed in by parseTerms().
        g: graph queried for the term's label, class, definition, comment
            and version.

    Returns:
        dict with ``name``, ``name_simple``, ``name_prefixed`` and ``uri``
        (plus ``fullname`` for DC terms); when the URI is not None also
        ``label``, ``class``, ``definition``, ``comment`` and ``version``.

    Raises:
        AssertionError: if the graph holds no rdfs:comment for the URI.
    """
    if name.startswith("DC_"):
        simple = name[3:]
        prefixed = "dcterms:" + simple
        term = {
            "name": prefixed,
            "name_simple": simple,
            "name_prefixed": prefixed,
            "fullname": prefixed,
        }
        uri = DC.term(simple)
    else:
        term = {
            "name": name,
            "name_simple": name,
            "name_prefixed": "dwc:" + name,
        }
        uri = DWC.term(name)
        if uri == REC_LEVEL:
            uri = None
    term["uri"] = uri

    # Nothing to look up for an entry without a URI.
    if uri is None:
        return term

    def lookup(predicate):
        return g.value(subject=uri, predicate=predicate)

    term["label"] = lookup(RDFS.label)
    term["class"] = lookup(DWCA.organizedInClass)
    # Definition and comment are free text; URLs in them become links.
    term["definition"] = anchorLinks(lookup(RDFS.comment))
    term["comment"] = anchorLinks(lookup(DC.description))
    term["version"] = lookup(DC.hasVersion)
    if term["definition"] is None:
        raise AssertionError("Unknown term definition "+str(uri))
    return term
    
def parseTerms():
    """Load the term graph and arrange the terms into ordered groups.

    Steps:

    1. Parse ``../terms/dwc_normative.rdf`` and drop the triples whose
       subject is ``accordingTo`` or the bare DWC namespace.
    2. Add definition, comment and version triples for every entry of
       ``dc.yaml`` (each entry provides ``definition``, ``comment`` and
       ``details``).
    3. Read ``term_order.yaml`` and build one group dict per entry via
       getTermDef(), with the member terms stored under ``"terms"``.
    4. Check the result with verifyCompleteness().

    All paths are relative to the current working directory.

    Returns:
        list of group dicts, in the order given by ``term_order.yaml``.

    Raises:
        AssertionError: if the namespace subject survives removal, a term
            has no definition, or the completeness check fails.
    """
    g = Graph()
    g.parse("../terms/dwc_normative.rdf")
    # we remove the abstract dwc term accordingTo
    g.remove((DWC.accordingTo,None,None))
    g.remove((DWC+"",None,None))
    if (DWC+"", None, None) in g:
        raise AssertionError("DWC NS in here")
    # safe_load only builds plain data and, unlike a bare yaml.load(f), is
    # accepted by every PyYAML release.
    with open('dc.yaml', 'r') as dcf:
        dc = yaml.safe_load(dcf)
    for t in dc:
        uri=DC[t]
        g.add( (uri, RDFS.comment, Literal(dc[t]["definition"])) )
        g.add( (uri, DC.description, Literal(dc[t]["comment"])) )
        g.add( (uri, DC.hasVersion, URIRef(dc[t]["details"])) )
    with open('term_order.yaml', 'r') as f:
        terms = yaml.safe_load(f)
    groups=[]
    for group in terms:
        # Presumably each entry is a one-key mapping {group name: [term
        # names]} - verify against term_order.yaml. The member list is read
        # through the chosen key so name and members always belong together.
        groupName = min(group)
        members = group[groupName]
        groupTerm=getTermDef(groupName, g)
        groupTerm["terms"]=[]
        if members is not None:
            for t in members:
                groupTerm["terms"].append(getTermDef(t, g))
        groups.append(groupTerm)
    # finally verify we have all terms covered in both the order yaml and the graph
    verifyCompleteness(g, groups)
    return groups

def verifyCompleteness(graph, groups):
    """Check that every subject of the graph is listed in the groups.

    The URIs of all groups and of their member terms are collected, then
    each subject of the graph (except ``accordingTo`` and the bare DWC
    namespace) must be among them.

    Args:
        graph: the graph whose subjects are checked.
        groups: list of group dicts, each holding ``uri`` and ``terms``.

    Raises:
        AssertionError: listing every subject that is absent from the
            groups, i.e. missing from term_order.yaml.
    """
    known = set()
    for g in groups:
        # A group without a URI (uri=None) is not a term and must not be
        # counted as the string "None".
        if g.get("uri") is not None:
            known.add(str(g["uri"]))
        for t in g["terms"]:
            known.add(str(t["uri"]))
    print("""%s terms defined""" % len(known))

    ignored = (DWC.accordingTo, DWC.term(""))
    # subjects() yields a subject once per triple; the set removes repeats
    # and sorting keeps the report stable between runs.
    missing = sorted(set(
        str(s) for s in graph.subjects()
        if s not in ignored and str(s) not in known
    ))
    if missing:
        raise AssertionError("Term missing from term_order.yaml: " + ", ".join(missing))
    print("""All terms exist in both the graph and yaml""")


if __name__ == "__main__":
    # Parse once, then hand the same groups to each generator in turn.
    data = parseTerms()
    for build in (buildHtml, buildDownloads):
        build(data)
finish dc.yaml and update script 2015-01-12 23:53:04 +00:00			`#!/usr/bin/env python`

			`'''`
			`easy_install pip`
			`pip install -r requirements.txt`
			`'''`

Adding generation of resources files to update script 2015-01-13 15:58:50 +00:00			`import yaml, re, csv, codecs`
finish dc.yaml and update script 2015-01-12 23:53:04 +00:00			`from Cheetah.Template import Template`
			`from rdflib import Graph, URIRef, Namespace, Literal`
			`from rdflib.namespace import RDFS`

			`DWC=Namespace("http://rs.tdwg.org/dwc/terms/")`
			`DC=Namespace("http://purl.org/dc/terms/")`
			`DWCA=Namespace("http://rs.tdwg.org/dwc/terms/attributes/")`
update yaml and implement graph/yaml consistency check. Add local bootstrap assets 2015-01-13 13:33:37 +00:00			`REC_LEVEL=DWC.term("Record-level")`

finish dc.yaml and update script 2015-01-12 23:53:04 +00:00
Adding generation of resources files to update script 2015-01-13 15:58:50 +00:00			`def buildHtml(groups):`
finish dc.yaml and update script 2015-01-12 23:53:04 +00:00			`print """building html files"""`
Adding generation of resources files to update script 2015-01-13 15:58:50 +00:00			`data={}`
			`data["groups"]=groups`
Rename template to terms 2015-01-13 11:06:49 +00:00			`html = Template(file="terms.tmpl", searchList=[data])`
remove unused rdf styelsheets 2015-01-13 00:14:49 +00:00			`recommended = open("../terms/index.html", "w")`
finish dc.yaml and update script 2015-01-12 23:53:04 +00:00			`recommended.write(str(html))`
			`recommended.close()`

Adding generation of resources files to update script 2015-01-13 15:58:50 +00:00			`def buildDownloads(groups):`
			`print """building dwc_terms.csv"""`
			`with open('../resources/dwc_terms.csv', 'w') as csvf:`
			`writer=csv.writer(csvf, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)`
			`writer.writerow(['TermName', 'URI', "Label_en", "Group", "Definition", "Comments"])`
			`for g in groups:`
			`for t in g["terms"]:`
			`writer.writerow([t["name"], t["uri"], utf8(t["label"]), t["class"], utf8(t["definition"]), utf8(t["comment"])])`
			`print """building simple_dwc_terms_list.csv"""`
			`with codecs.open('../resources/simple_dwc_terms_list.csv', 'w', 'utf-8') as f:`
			`for g in groups:`
			`for t in g["terms"]:`
			`f.write(t["name_simple"] + "\n")`
			`print """building simple_dwc.properties"""`
			`with codecs.open('../resources/simple_dwc.properties', 'w', 'utf-8') as f:`
			`for g in groups:`
			`for t in g["terms"]:`
			`term=t["name_simple"]`
			`f.write("%s.name=%s\n" % (term,term))`
			`f.write("%s.uri=%s\n" % (term,t["uri"]))`
			`f.write("%s.label=%s\n" % (term, n2e(t["label"])))`
			`f.write("%s.definition=%s\n" % (term, n2e(t["definition"])))`
			`f.write("%s.comment=%s\n" % (term, n2e(t["comment"])))`
			`print """building simple_dwc_terms_header.csv"""`
			`with codecs.open('../resources/simple_dwc_terms_header.csv', 'w', 'utf-8') as f:`
			`started=False`
			`for g in groups:`
			`for t in g["terms"]:`
			`if started:`
			`f.write(",")`
			`f.write('"'+t["name_simple"]+'"')`
			`started=True`
			`f.write("\n")`
			`print """building simple_dwc_pgsql.sql"""`
			`with open('term_type.yaml', 'r') as f:`
			`types = yaml.load(f)`
			`with codecs.open('../resources/simple_dwc_pgsql.sql', 'w', 'utf-8') as f:`
			`started=False`
			`f.write("CREATE TABLE dwc (\n")`
			`for g in groups:`
			`for t in g["terms"]:`
			`if started:`
			`f.write(",\n")`
			`f.write(' "%s" ' % t["name_simple"])`
			`f.write(types.get(t["name_simple"], "text"))`
			`started=True`
			`f.write("\n);\n")`

			`def n2e(x):`
			`if x is None:`
			`return ""`
			`return x`

			`def utf8(x):`
			`if x is None:`
			`return x`
			`return x.encode("utf-8")`
finish dc.yaml and update script 2015-01-12 23:53:04 +00:00
update terms reference layout according to https://github.com/tdwg/dwc/issues/68 2015-01-13 10:57:55 +00:00			`def anchorLinks(x):`
Adding generation of resources files to update script 2015-01-13 15:58:50 +00:00			`if x is None:`
			`return x`
			`return re.sub('(https?://\S+)', "<a href='\\1'>\\1</a>", x)`
update terms reference layout according to https://github.com/tdwg/dwc/issues/68 2015-01-13 10:57:55 +00:00
			`def getTermDef(name, g):`
finish dc.yaml and update script 2015-01-12 23:53:04 +00:00			`t={}`
			`if name.startswith("DC_"):`
			`name=name[3:]`
			`t["name"]="dcterms:"+name`
update yaml and implement graph/yaml consistency check. Add local bootstrap assets 2015-01-13 13:33:37 +00:00			`t["name_simple"]=name`
			`t["name_prefixed"]=t["name"]`
update terms reference layout according to https://github.com/tdwg/dwc/issues/68 2015-01-13 10:57:55 +00:00			`t["fullname"]=t["name"]`
update yaml and implement graph/yaml consistency check. Add local bootstrap assets 2015-01-13 13:33:37 +00:00			`uri=DC.term(name)`
finish dc.yaml and update script 2015-01-12 23:53:04 +00:00			`else:`
update yaml and implement graph/yaml consistency check. Add local bootstrap assets 2015-01-13 13:33:37 +00:00			`t["name"]=name`
			`t["name_simple"]=name`
			`t["name_prefixed"]="dwc:"+name`
			`uri=DWC.term(name)`
			`if uri==REC_LEVEL:`
			`uri=None`
finish dc.yaml and update script 2015-01-12 23:53:04 +00:00			`t["uri"]=uri`
update yaml and implement graph/yaml consistency check. Add local bootstrap assets 2015-01-13 13:33:37 +00:00			`if uri is not None:`
			`t["label"]=g.value(subject=uri, predicate=RDFS.label)`
			`t["class"]=g.value(subject=uri, predicate=DWCA.organizedInClass)`
Adding generation of resources files to update script 2015-01-13 15:58:50 +00:00			`t["definition"]=anchorLinks(g.value(subject=uri, predicate=RDFS.comment))`
			`t["comment"]=anchorLinks(g.value(subject=uri, predicate=DC.description))`
update yaml and implement graph/yaml consistency check. Add local bootstrap assets 2015-01-13 13:33:37 +00:00			`t["version"]=g.value(subject=uri, predicate=DC.hasVersion)`
			`if t["definition"] is None:`
			`raise AssertionError("Unknown term definition "+str(uri))`
finish dc.yaml and update script 2015-01-12 23:53:04 +00:00			`return t`

			`def parseTerms():`
			`g = Graph()`
			`g.parse("../terms/dwc_normative.rdf")`
update yaml and implement graph/yaml consistency check. Add local bootstrap assets 2015-01-13 13:33:37 +00:00			`# we remove the abstract dwc term accordingTo`
			`g.remove((DWC.accordingTo,None,None))`
			`g.remove((DWC+"",None,None))`
			`if (DWC+"", None, None) in g:`
			`raise AssertionError("DWC NS in here")`
			`with open('dc.yaml', 'r') as dcf:`
			`dc = yaml.load(dcf)`
finish dc.yaml and update script 2015-01-12 23:53:04 +00:00			`for t in dc:`
			`uri=DC[t]`
			`g.add( (uri, RDFS.comment, Literal(dc[t]["definition"])) )`
			`g.add( (uri, DC.description, Literal(dc[t]["comment"])) )`
			`g.add( (uri, DC.hasVersion, URIRef(dc[t]["details"])) )`
update yaml and implement graph/yaml consistency check. Add local bootstrap assets 2015-01-13 13:33:37 +00:00			`with open('term_order.yaml', 'r') as f:`
			`terms = yaml.load(f)`
finish dc.yaml and update script 2015-01-12 23:53:04 +00:00			`data={}`
			`groups=[]`
update yaml and implement graph/yaml consistency check. Add local bootstrap assets 2015-01-13 13:33:37 +00:00			`for group in terms:`
			`groupTerm=getTermDef(sorted(group.keys())[0], g)`
			`groupTerm["terms"]=[]`
			`if group.values() is not None and sorted(group.values())[0] is not None:`
			`for t in sorted(group.values())[0]:`
			`groupTerm["terms"].append(getTermDef(t, g))`
			`groups.append(groupTerm)`
			`# finally verify we have all terms covered in both the order yaml and the graph`
			`verifyCompleteness(g, groups)`
Adding generation of resources files to update script 2015-01-13 15:58:50 +00:00			`return groups`
finish dc.yaml and update script 2015-01-12 23:53:04 +00:00
update yaml and implement graph/yaml consistency check. Add local bootstrap assets 2015-01-13 13:33:37 +00:00			`def verifyCompleteness(graph, groups):`
			`terms={}`
			`for g in groups:`
			`if "uri" in g:`
			`terms[str(g["uri"])]=1`
			`for t in g["terms"]:`
			`terms[str(t["uri"])]=1`
			`print """%s terms defined""" % len(terms)`
			`for s in graph.subjects():`
			`if s not in (DWC.accordingTo, DWC.term("")) and str(s) not in terms:`
			`raise AssertionError("Term missing from terms_order.yaml: "+s)`
			`print """All terms exist in both the graph and yaml"""`


finish dc.yaml and update script 2015-01-12 23:53:04 +00:00

			`if __name__ == "__main__":`
Adding generation of resources files to update script 2015-01-13 15:58:50 +00:00			`data=parseTerms()`
			`buildHtml(data)`
			`buildDownloads(data)`