# Script to build Markdown pages that provide term metadata for simple vocabularies
# Steve Baskauf 2020-06-28 CC0
#
# This script merges static Markdown header and footer documents with term information tables (in Markdown)
# generated from data in the rs.tdwg.org repo from the TDWG GitHub site.
#
# Note: this script calls a function from http_library.py, which requires importing the requests, csv, and
# json modules.

# Standard library
import re
import csv  # library to read/write/parse CSV files
import json  # library to convert JSON to Python data structures

# Third-party
import requests  # best library to manage HTTP transactions
import pandas as pd

# -----------------
# Configuration section
# -----------------

# !!!! Note !!!!
# This is an example of a simple vocabulary without categories. For a complex example
# with multiple namespaces and several categories, see build-page-categories.ipynb

# Base URL for raw files from the branch of the repo that has been pushed to GitHub.
# The URL below points at the "master" branch; change the last path segment to read from another branch.
githubBaseUri = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/'

# Static Markdown documents that are placed before and after the generated term tables.
headerFileName = 'termlist-header.md'
footerFileName = 'termlist-footer.md'

# Path of the Markdown page that the script writes.
outFileName = '../../docs/pw/index.md'

# This is a Python list of the database names of the term lists to be included in the document.
termLists = ['pathway']

# NOTE! There may be problems unless every term list is of the same vocabulary type since the number of
# columns will differ. However, there probably aren't any circumstances where mixed types will be used to
# generate the same page.
# 1 is simple vocabulary, 2 is simple controlled vocabulary, 3 is c.v. with broader hierarchy
vocab_type = 3

# Terms in large vocabularies like Darwin and Audubon Cores may be organized into categories using
# tdwgutility_organizedInClass. If so, those categories can be used to group terms in the generated term
# list document.
# NOTE(review): this section looks extraction-damaged and should be restored from the upstream script before
# any code change is attempted. Evidence visible in the text itself:
#   * statement line breaks are collapsed, so the text no longer parses as the statements it spells out;
#   * string literals described as producing HTML carry no markup, e.g. convert_code() substitutes r'\1'
#     although its docstring promises the "html tagged version", and convert_link() formats "{}" with two
#     arguments;
#   * the text jumps from the middle of the `regx` pattern in convert_link() straight into the per-term table
#     code, so the end of convert_link() and whatever defines `row`, `text`, `decisions_df` and
#     `index_by_label` are not present here.
#
# What is still readable:
#   * remaining configuration: organized_in_categories, display_order, display_label, display_comments,
#     display_id;
#   * createLinks(text): re.sub() over http(s) URLs, with a trailing '.' kept outside the replacement;
#   * convert_code(text_with_backticks): re.sub() over back-quoted spans;
#   * convert_link(text_with_urls): defines a `_handle_matched` callback and a URL pattern `regx`
#     (remainder truncated, see above);
#   * per-term output appended to `text`: term name, term IRI, modified date, version IRI (if non-empty),
#     label, deprecation notice (if term_deprecated is non-empty), definition, usage and notes (both passed
#     through convert_link(convert_code(...))), controlled value (vocab_type 2 or 3), broader concept
#     (vocab_type 3), type, and every decisions_df row whose linked_affected_resource equals the term IRI;
#   * final step: read headerFileName and footerFileName, write header + text + footer to outFileName,
#     then print 'done'.
organized_in_categories = False # If organized in categories, the display_order list must contain the IRIs that are values of tdwgutility_organizedInClass # If not organized into categories, the value is irrelevant. There just needs to be one item in the list. display_order = [''] display_label = ['Vocabulary'] # these are the section labels for the categories in the page display_comments = [''] # these are the comments about the category to be appended following the section labels display_id = ['Vocabulary'] # these are the fragment identifiers for the associated sections for the categories # --------------- # Function definitions # --------------- # replace URL with link # def createLinks(text): def repl(match): if match.group(1)[-1] == '.': return '' + match.group(1)[:-1] + '.' return '' + match.group(1) + '' pattern = '(https?://[^\s,;\)"]*)' result = re.sub(pattern, repl, text) return result # 2021-08-06 Replace the createLinks() function with functions copied from the QRG build script written by S. Van Hoey def convert_code(text_with_backticks): """Takes all back-quoted sections in a text field and converts it to the html tagged version of code blocks ... 
""" return re.sub(r'`([^`]*)`', r'\1', text_with_backticks) def convert_link(text_with_urls): """Takes all links in a text field and converts it to the html tagged version of the link """ def _handle_matched(inputstring): """quick hack version of url handling on the current prime versions data""" url = inputstring.group() return "{}".format(url, url) regx = "(http[s]?://[\w\d:#@%/;$()~_?\+-;=\\\.&]*)(?\n' curie = row['pref_ns_prefix'] + ":" + row['term_localName'] curieAnchor = curie.replace(':','_') text += '\t\n' text += '\t\t\n' text += '\t\t\tTerm Name ' + curie + '\n' text += '\t\t\n' text += '\t\n' text += '\t\n' text += '\t\t\n' text += '\t\t\tTerm IRI\n' uri = row['pref_ns_uri'] + row['term_localName'] text += '\t\t\t' + uri + '\n' text += '\t\t\n' text += '\t\t\n' text += '\t\t\tModified\n' text += '\t\t\t' + row['term_modified'] + '\n' text += '\t\t\n' if row['version_iri'] != '': text += '\t\t\n' text += '\t\t\tTerm version IRI\n' text += '\t\t\t' + row['version_iri'] + '\n' text += '\t\t\n' text += '\t\t\n' text += '\t\t\tLabel\n' text += '\t\t\t' + row['label'] + '\n' text += '\t\t\n' if row['term_deprecated'] != '': text += '\t\t\n' text += '\t\t\t\n' text += '\t\t\tThis term is deprecated and should no longer be used.\n' text += '\t\t\n' text += '\t\t\n' text += '\t\t\tDefinition\n' text += '\t\t\t' + row['definition'] + '\n' text += '\t\t\n' if row['usage'] != '': text += '\t\t\n' text += '\t\t\tUsage\n' text += '\t\t\t' + convert_link(convert_code(row['usage'])) + '\n' text += '\t\t\n' if row['notes'] != '': text += '\t\t\n' text += '\t\t\tNotes\n' text += '\t\t\t' + convert_link(convert_code(row['notes'])) + '\n' text += '\t\t\n' if (vocab_type == 2 or vocab_type == 3) and row['controlled_value_string'] != '': # controlled vocabulary text += '\t\t\n' text += '\t\t\tControlled value\n' text += '\t\t\t' + row['controlled_value_string'] + '\n' text += '\t\t\n' if vocab_type == 3 and row['skos_broader'] != '': # controlled vocabulary with skos:broader 
relationships text += '\t\t\n' text += '\t\t\tHas broader concept\n' curieAnchor = row['skos_broader'].replace(':','_') text += '\t\t\t' + row['skos_broader'] + '\n' text += '\t\t\n' text += '\t\t\n' text += '\t\t\tType\n' if row['type'] == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property': text += '\t\t\tProperty\n' elif row['type'] == 'http://www.w3.org/2000/01/rdf-schema#Class': text += '\t\t\tClass\n' elif row['type'] == 'http://www.w3.org/2004/02/skos/core#Concept': text += '\t\t\tConcept\n' else: text += '\t\t\t' + row['type'] + '\n' # this should rarely happen text += '\t\t\n' # Look up decisions related to this term for drow_index,drow in decisions_df.iterrows(): if drow['linked_affected_resource'] == uri: text += '\t\t\n' text += '\t\t\tExecutive Committee decision\n' text += '\t\t\thttp://rs.tdwg.org/decisions/' + drow['decision_localName'] + '\n' text += '\t\t\n' text += '\t\n' text += '\n' text += '\n' text += '\n' term_table = text text = index_by_label + term_table # read in header and footer, merge with terms table, and output headerObject = open(headerFileName, 'rt', encoding='utf-8') header = headerObject.read() headerObject.close() footerObject = open(footerFileName, 'rt', encoding='utf-8') footer = footerObject.read() footerObject.close() output = header + text + footer outputObject = open(outFileName, 'wt', encoding='utf-8') outputObject.write(output) outputObject.close() print('done')