# Script to build Markdown pages that provide term metadata for simple vocabularies
# Steve Baskauf 2020-06-28 CC0
# This script merges static Markdown header and footer documents with term information tables (in Markdown) generated from data in the rs.tdwg.org repo from the TDWG Github site
# Note: this script calls a function from http_library.py, which requires importing the requests, csv, and json modules
import re
import requests # best library to manage HTTP transactions
import csv # library to read/write/parse CSV files
import json # library to convert JSON to Python data structures
import pandas as pd
# -----------------
# Configuration section
# -----------------
# !!!! Note !!!!
# This is an example of a simple vocabulary without categories. For a complex example
# with multiple namespaces and several categories, see build-page-categories.ipynb
# Base URL for raw files from the branch of the repo that has been pushed to GitHub.
# NOTE(review): this comment previously said the branch is named "pathway", but the URL
# below points at the "master" branch -- confirm which branch is intended.
githubBaseUri = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/'
# File names of the static Markdown header and footer documents that are merged with the
# generated term information tables (see the description at the top of this script).
headerFileName = 'termlist-header.md'
footerFileName = 'termlist-footer.md'
# Path of the Markdown page to be generated -- presumably relative to the directory the
# script is run from; verify before running from another location.
outFileName = '../../docs/pw/index.md'
# This is a Python list of the database names of the term lists to be included in the document.
termLists = ['pathway']
# NOTE! There may be problems unless every term list is of the same vocabulary type, since the number of columns will differ.
# However, there probably aren't any circumstances where mixed types will be used to generate the same page.
vocab_type = 3 # 1 is simple vocabulary, 2 is simple controlled vocabulary, 3 is c.v. with broader hierarchy
# Terms in large vocabularies like Darwin and Audubon Cores may be organized into categories using tdwgutility_organizedInClass.
# If so, those categories can be used to group terms in the generated term list document.
organized_in_categories = False
# If organized in categories, the display_order list must contain the IRIs that are values of tdwgutility_organizedInClass.
# If not organized into categories, the value is irrelevant. There just needs to be one item in the list.
# The four display_* lists below each hold one entry per category (a single entry here).
display_order = ['']
display_label = ['Vocabulary'] # these are the section labels for the categories in the page
display_comments = [''] # these are the comments about the category to be appended following the section labels
display_id = ['Vocabulary'] # these are the fragment identifiers for the associated sections for the categories
# ---------------
# Function definitions
# ---------------
# replace URL with link
#
def createLinks(text):
    """Replace every URL found in ``text`` with an HTML hyperlink to that URL.

    A URL is any run starting with ``http://`` or ``https://`` and extending
    up to (but not including) the next whitespace character, comma,
    semicolon, closing parenthesis, or double quote.

    Parameters
    ----------
    text : str
        Text that may contain bare URLs.

    Returns
    -------
    str
        ``text`` with each URL wrapped as ``<a href="URL">URL</a>``. Text that
        contains no URL is returned unchanged.
    """
    def repl(match):
        url = match.group(1)
        # The pattern does not exclude periods (they are legal inside URLs),
        # so a URL that ends a sentence captures the final period as well.
        # Keep that period outside the link so the href stays valid.
        if url.endswith('.'):
            url = url[:-1]
            return '<a href="' + url + '">' + url + '</a>.'
        return '<a href="' + url + '">' + url + '</a>'

    # Raw string: \s and \) are regex escapes, not Python string escapes.
    pattern = r'(https?://[^\s,;\)"]*)'
    return re.sub(pattern, repl, text)
# 2021-08-06 Replace the createLinks() function with functions copied from the QRG build script written by S. Van Hoey
def convert_code(text_with_backticks):
    """Convert every back-quoted section of a text field to an HTML code element.

    Each span of the form `` `content` `` is replaced with
    ``<code>content</code>``. The content may be empty but cannot itself
    contain a backtick; an unpaired backtick is left untouched.

    Parameters
    ----------
    text_with_backticks : str
        Text that may contain Markdown-style back-quoted sections.

    Returns
    -------
    str
        The text with each back-quoted section wrapped in ``<code>`` tags.
    """
    return re.sub(r'`([^`]*)`', r'<code>\1</code>', text_with_backticks)
def convert_link(text_with_urls):
"""Takes all links in a text field and converts it to the html tagged
version of the link
"""
def _handle_matched(inputstring):
"""quick hack version of url handling on the current prime versions data"""
url = inputstring.group()
return "{}".format(url, url)
regx = "(http[s]?://[\w\d:#@%/;$()~_?\+-;=\\\.&]*)(?\n'
curie = row['pref_ns_prefix'] + ":" + row['term_localName']
curieAnchor = curie.replace(':','_')
text += '\t\n'
text += '\t\t\n'
text += '\t\t\t \n'
text += '\t\n'
text += '\tTerm Name ' + curie + ' \n'
text += '\t\t