{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Script to build Markdown pages that provide term metadata for simple vocabularies\n",
"# Steve Baskauf 2020-06-28 CC0\n",
"# This script merges static Markdown header and footer documents with term information tables (in Markdown) generated from data in the rs.tdwg.org repo from the TDWG Github site\n",
"\n",
"# Note: this script calls a function from http_library.py, which requires importing the requests, csv, and json modules\n",
"import re\n",
"import requests # best library to manage HTTP transactions\n",
"import csv # library to read/write/parse CSV files\n",
"import json # library to convert JSON to Python data structures\n",
"import pandas as pd\n",
"\n",
"# -----------------\n",
"# Configuration section\n",
"# -----------------\n",
"\n",
"# !!!! Note !!!!\n",
"# This is an example of a simple vocabulary without categories. For a complex example\n",
"# with multiple namespaces and several categories, see build-page-categories.ipynb\n",
"\n",
"# This is the base URL for raw files from the branch of the repo that has been pushed to GitHub. In this example,\n",
"# the branch is named \"master\"\n",
"githubBaseUri = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/'\n",
"\n",
"headerFileName = 'termlist-header.md'\n",
"footerFileName = 'termlist-footer.md'\n",
"outFileName = '../../docs/pw/index.md'\n",
"\n",
"# This is a Python list of the database names of the term lists to be included in the document.\n",
"termLists = ['pathway']\n",
"\n",
"# NOTE! There may be problems unless every term list is of the same vocabulary type since the number of columns will differ\n",
"# However, there probably aren't any circumstances where mixed types will be used to generate the same page.\n",
"vocab_type = 3 # 1 is simple vocabulary, 2 is simple controlled vocabulary, 3 is c.v. with broader hierarchy\n",
"\n",
"# Terms in large vocabularies like Darwin and Audubon Cores may be organized into categories using tdwgutility_organizedInClass\n",
"# If so, those categories can be used to group terms in the generated term list document.\n",
"organized_in_categories = False\n",
"\n",
"# If organized in categories, the display_order list must contain the IRIs that are values of tdwgutility_organizedInClass\n",
"# If not organized into categories, the value is irrelevant. There just needs to be one item in the list.\n",
"display_order = ['']\n",
"display_label = ['Vocabulary'] # these are the section labels for the categories in the page\n",
"display_comments = [''] # these are the comments about the category to be appended following the section labels\n",
"display_id = ['Vocabulary'] # these are the fragment identifiers for the associated sections for the categories\n",
"\n",
"# ---------------\n",
"# Function definitions\n",
"# ---------------\n",
"\n",
"# replace URL with link\n",
"#\n",
"def createLinks(text):\n",
"    \"\"\"Wrap every http(s) URL found in text in an HTML anchor element.\n",
"\n",
"    A URL runs until the first whitespace character, comma, semicolon,\n",
"    closing parenthesis or double quote. A period at the very end of a\n",
"    match is treated as sentence punctuation and is kept outside the link.\n",
"\n",
"    text: string that may contain URLs\n",
"    returns: the string with each URL replaced by <a href=\"URL\">URL</a>\n",
"    \"\"\"\n",
"    def repl(match):\n",
"        url = match.group(1)\n",
"        trailing = ''\n",
"        if url[-1] == '.':\n",
"            # a sentence-ending period is not part of the URL\n",
"            url = url[:-1]\n",
"            trailing = '.'\n",
"        return '<a href=\"' + url + '\">' + url + '</a>' + trailing\n",
"\n",
"    # raw string so that \\s and \\) reach the regex engine without invalid-escape warnings\n",
"    pattern = r'(https?://[^\\s,;\\)\"]*)'\n",
"    return re.sub(pattern, repl, text)\n",
"\n",
"# 2021-08-06 Replace the createLinks() function with functions copied from the QRG build script written by S. Van Hoey\n",
"def convert_code(text_with_backticks):\n",
"    \"\"\"Convert every back-quoted span in a text field to an HTML code element.\n",
"\n",
"    text_with_backticks: string that may contain `...` spans\n",
"    returns: the string with each `x` replaced by <code>x</code>\n",
"    \"\"\"\n",
"    return re.sub(r'`([^`]*)`', r'<code>\\1</code>', text_with_backticks)\n",
"\n",
"def convert_link(text_with_urls):\n",
" \"\"\"Takes all links in a text field and converts it to the html tagged\n",
" version of the link\n",
" \"\"\"\n",
" def _handle_matched(inputstring):\n",
" \"\"\"quick hack version of url handling on the current prime versions data\"\"\"\n",
" url = inputstring.group()\n",
" return \"{}\".format(url, url)\n",
"\n",
" regx = \"(http[s]?://[\\w\\d:#@%/;$()~_?\\+-;=\\\\\\.&]*)(?\\n'\n",
" curie = row['pref_ns_prefix'] + \":\" + row['term_localName']\n",
" curieAnchor = curie.replace(':','_')\n",
" text += '\\t\\n'\n",
" text += '\\t\\t\\n'\n",
" text += '\\t\\t\\t \\n'\n",
" text += '\\t\\n'\n",
" text += '\\tTerm Name ' + curie + ' \\n'\n",
" text += '\\t\\t