{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Script to build Markdown pages that provide term metadata for simple vocabularies\n", "# Steve Baskauf 2020-06-28 CC0\n", "# This script merges static Markdown header and footer documents with term information tables (in Markdown) generated from data in the rs.tdwg.org repo from the TDWG Github site\n", "\n", "# Note: this script calls a function from http_library.py, which requires importing the requests, csv, and json modules\n", "import re\n", "import requests # best library to manage HTTP transactions\n", "import csv # library to read/write/parse CSV files\n", "import json # library to convert JSON to Python data structures\n", "import pandas as pd\n", "\n", "# -----------------\n", "# Configuration section\n", "# -----------------\n", "\n", "# !!!! Note !!!!\n", "# This is an example of a simple vocabulary without categories. For a complex example\n", "# with multiple namespaces and several categories, see build-page-categories.ipynb\n", "\n", "# This is the base URL for raw files from the branch of the repo that has been pushed to GitHub. In this example,\n", "# the branch is named \"pathway\"\n", "githubBaseUri = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/'\n", "\n", "headerFileName = 'termlist-header.md'\n", "footerFileName = 'termlist-footer.md'\n", "outFileName = '../../docs/pw/index.md'\n", "\n", "# This is a Python list of the database names of the term lists to be included in the document.\n", "termLists = ['pathway']\n", "\n", "# NOTE! There may be problems unless every term list is of the same vocabulary type since the number of columns will differ\n", "# However, there probably aren't any circumstances where mixed types will be used to generate the same page.\n", "vocab_type = 3 # 1 is simple vocabulary, 2 is simple controlled vocabulary, 3 is c.v. 
with broader hierarchy\n", "\n", "# Terms in large vocabularies like Darwin and Audubon Cores may be organized into categories using tdwgutility_organizedInClass\n", "# If so, those categories can be used to group terms in the generated term list document.\n", "organized_in_categories = False\n", "\n", "# If organized in categories, the display_order list must contain the IRIs that are values of tdwgutility_organizedInClass\n", "# If not organized into categories, the value is irrelevant. There just needs to be one item in the list.\n", "display_order = ['']\n", "display_label = ['Vocabulary'] # these are the section labels for the categories in the page\n", "display_comments = [''] # these are the comments about the category to be appended following the section labels\n", "display_id = ['Vocabulary'] # these are the fragment identifiers for the associated sections for the categories\n", "\n", "# ---------------\n", "# Function definitions\n", "# ---------------\n", "\n", "# replace URL with link\n", "#\n", "def createLinks(text):\n", "    def repl(match):\n", "        if match.group(1)[-1] == '.':\n", "            return '<a href=\"' + match.group(1)[:-1] + '\">' + match.group(1)[:-1] + '</a>.'\n", "        return '<a href=\"' + match.group(1) + '\">' + match.group(1) + '</a>'\n", "\n", "    pattern = '(https?://[^\\s,;\\)\"]*)'\n", "    result = re.sub(pattern, repl, text)\n", "    return result" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "term_lists_info = []\n", "\n", "frame = pd.read_csv(githubBaseUri + 'term-lists/term-lists.csv', na_filter=False)\n", "for termList in termLists:\n", "    term_list_dict = {'database': termList}\n", "    for index,row in frame.iterrows():\n", "        if row['database'] == termList:\n", "            term_list_dict['pref_ns_prefix'] = row['vann_preferredNamespacePrefix']\n", "            term_list_dict['pref_ns_uri'] = row['vann_preferredNamespaceUri']\n", "            term_list_dict['list_iri'] = row['list']\n", "    term_lists_info.append(term_list_dict)\n", "print(term_lists_info)" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create column list\n", "column_list = ['pref_ns_prefix', 'pref_ns_uri', 'term_localName', 'label', 'definition', 'usage', 'notes', 'term_modified', 'term_deprecated', 'type']\n", "if vocab_type == 2:\n", "    column_list += ['controlled_value_string']\n", "elif vocab_type == 3:\n", "    column_list += ['controlled_value_string', 'skos_broader']\n", "if organized_in_categories:\n", "    column_list.append('tdwgutility_organizedInClass')\n", "column_list.append('version_iri')\n", "\n", "# Create list of lists metadata table\n", "table_list = []\n", "for term_list in term_lists_info:\n", "    # retrieve versions metadata for term list\n", "    versions_url = githubBaseUri + term_list['database'] + '-versions/' + term_list['database'] + '-versions.csv'\n", "    versions_df = pd.read_csv(versions_url, na_filter=False)\n", "    \n", "    # retrieve current term metadata for term list\n", "    data_url = githubBaseUri + term_list['database'] + '/' + term_list['database'] + '.csv'\n", "    frame = pd.read_csv(data_url, na_filter=False)\n", "    for index,row in frame.iterrows():\n", "        row_list = [term_list['pref_ns_prefix'], term_list['pref_ns_uri'], row['term_localName'], row['label'], row['definition'], row['usage'], row['notes'], row['term_modified'], row['term_deprecated'], row['type']]\n", "        if vocab_type == 2:\n", "            row_list += [row['controlled_value_string']]\n", "        elif vocab_type == 3:\n", "            if row['skos_broader'] =='':\n", "                row_list += [row['controlled_value_string'], '']\n", "            else:\n", "                row_list += [row['controlled_value_string'], term_list['pref_ns_prefix'] + ':' + row['skos_broader']]\n", "        if organized_in_categories:\n", "            row_list.append(row['tdwgutility_organizedInClass'])\n", "\n", "        # Borrowed terms really don't have implemented versions. They may be lacking values for version_status.\n", "        # In their case, their version IRI will be omitted.\n", "        found = False\n", "        for vindex, vrow in versions_df.iterrows():\n", "            if vrow['term_localName']==row['term_localName'] and vrow['version_status']=='recommended':\n", "                found = True\n", "                version_iri = vrow['version']\n", "                # NOTE: the current hack for non-TDWG terms without a version is to append # to the end of the term IRI\n", "                if version_iri[len(version_iri)-1] == '#':\n", "                    version_iri = ''\n", "        if not found:\n", "            version_iri = ''\n", "        row_list.append(version_iri)\n", "\n", "        table_list.append(row_list)\n", "\n", "# Turn list of lists into dataframe\n", "terms_df = pd.DataFrame(table_list, columns = column_list)\n", "\n", "terms_sorted_by_label = terms_df.sort_values(by='label')\n", "terms_sorted_by_localname = terms_df.sort_values(by='term_localName')\n", "terms_sorted_by_label" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Run the following cell to generate an index sorted alphabetically by lowercase term local name. Omit this index if the terms have opaque local names." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# generate the index of terms grouped by category and sorted alphabetically by lowercase term local name\n", "\n", "text = '### 3.1 Index By Term Name\\n\\n'\n", "text += '(See also [3.2 Index By Label](#32-index-by-label))\\n\\n'\n", "for category in range(0,len(display_order)):\n", "    text += '**' + display_label[category] + '**\\n'\n", "    text += '\\n'\n", "    if organized_in_categories:\n", "        filtered_table = terms_sorted_by_localname[terms_sorted_by_localname['tdwgutility_organizedInClass']==display_order[category]]\n", "        filtered_table.reset_index(drop=True, inplace=True)\n", "    else:\n", "        filtered_table = terms_sorted_by_localname\n", "        filtered_table.reset_index(drop=True, inplace=True)\n", "    \n", "    for row_index,row in filtered_table.iterrows():\n", "        curie = row['pref_ns_prefix'] + \":\" + row['term_localName']\n", "        curie_anchor = curie.replace(':','_')\n", "        text += '[' + curie + '](#' + curie_anchor + ')'\n", "        if row_index < len(filtered_table) - 1:\n", "            text += ' |'\n", "        text += '\\n'\n", "    text += '\\n'\n", "index_by_name = text\n", "\n", "print(index_by_name)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Run the following cell to generate an index by term label" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text = '\\n\\n'\n", "\n", "# Comment out the following two lines if there is no index by local names\n", "#text = '### 3.2 Index By Label\\n\\n'\n", "#text += '(See also [3.1 Index By Term Name](#31-index-by-term-name))\\n\\n'\n", "for category in range(0,len(display_order)):\n", "    if organized_in_categories:\n", "        text += '**' + display_label[category] + '**\\n'\n", "        text += '\\n'\n", "        filtered_table = terms_sorted_by_label[terms_sorted_by_label['tdwgutility_organizedInClass']==display_order[category]]\n", "        filtered_table.reset_index(drop=True, inplace=True)\n", "    else:\n", "        filtered_table = terms_sorted_by_label\n", "        filtered_table.reset_index(drop=True, inplace=True)\n", "    \n", "    for row_index,row in filtered_table.iterrows():\n", "        if row_index == 0 or (row_index != 0 and row['label'] != filtered_table.iloc[row_index - 1].loc['label']): # this is a hack to prevent duplicate labels\n", "            curie_anchor = row['pref_ns_prefix'] + \"_\" + row['term_localName']\n", "            text += '[' + row['label'] + '](#' + curie_anchor + ')'\n", "            if row_index < len(filtered_table) - 2 or (row_index == len(filtered_table) - 2 and row['label'] != filtered_table.iloc[row_index + 1].loc['label']):\n", "                text += ' |'\n", "            text += '\\n'\n", "    text += '\\n'\n", "index_by_label = text\n", "\n", "print(index_by_label)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decisions_df = pd.read_csv('https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/decisions/decisions-links.csv', na_filter=False)\n", "\n", "# generate a table for each term, with terms grouped by category\n", "\n", "# generate the Markdown for the terms table\n", "text = '## 4 Vocabulary\\n'\n", "for category in range(0,len(display_order)):\n", "    if organized_in_categories:\n", "        text += '### 4.' 
+ str(category + 1) + ' ' + display_label[category] + '\\n'\n", " text += '\\n'\n", " text += display_comments[category] # insert the comments for the category, if any.\n", " filtered_table = terms_sorted_by_localname[terms_sorted_by_localname['tdwgutility_organizedInClass']==display_order[category]]\n", " filtered_table.reset_index(drop=True, inplace=True)\n", " else:\n", " filtered_table = terms_sorted_by_localname\n", " filtered_table.reset_index(drop=True, inplace=True)\n", "\n", " for row_index,row in filtered_table.iterrows():\n", " text += '\\n'\n", " curie = row['pref_ns_prefix'] + \":\" + row['term_localName']\n", " curieAnchor = curie.replace(':','_')\n", " text += '\\t\\n'\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\n'\n", " text += '\\t\\n'\n", " text += '\\t\\n'\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " uri = row['pref_ns_uri'] + row['term_localName']\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\n'\n", "\n", " if row['version_iri'] != '':\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\n'\n", "\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\n'\n", "\n", " if row['term_deprecated'] != '':\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\n'\n", "\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\n'\n", "\n", " if row['usage'] != '':\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\n'\n", "\n", " if row['notes'] != '':\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\n'\n", "\n", " if (vocab_type == 2 or vocab_type == 3) and 
row['controlled_value_string'] != '': # controlled vocabulary\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\n'\n", "\n", " if vocab_type == 3 and row['skos_broader'] != '': # controlled vocabulary with skos:broader relationships\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " curieAnchor = row['skos_broader'].replace(':','_')\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\n'\n", "\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " if row['type'] == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property':\n", " text += '\\t\\t\\t\\n'\n", " elif row['type'] == 'http://www.w3.org/2000/01/rdf-schema#Class':\n", " text += '\\t\\t\\t\\n'\n", " elif row['type'] == 'http://www.w3.org/2004/02/skos/core#Concept':\n", " text += '\\t\\t\\t\\n'\n", " else:\n", " text += '\\t\\t\\t\\n' # this should rarely happen\n", " text += '\\t\\t\\n'\n", "\n", " # Look up decisions related to this term\n", " for drow_index,drow in decisions_df.iterrows():\n", " if drow['linked_affected_resource'] == uri:\n", " text += '\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\t\\n'\n", " text += '\\t\\t\\n' \n", "\n", " text += '\\t\\n'\n", " text += '
Term Name ' + curie + '
Term IRI' + uri + '
Modified' + row['term_modified'] + '
Term version IRI' + row['version_iri'] + '
Label' + row['label'] + '
This term is deprecated and should no longer be used.
Definition' + row['definition'] + '
Usage' + createLinks(row['usage']) + '
Notes' + createLinks(row['notes']) + '
Controlled value' + row['controlled_value_string'] + '
Has broader concept' + row['skos_broader'] + '
TypePropertyClassConcept' + row['type'] + '
Executive Committee decisionhttp://rs.tdwg.org/decisions/' + drow['decision_localName'] + '
\\n'\n", " text += '\\n'\n", " text += '\\n'\n", "term_table = text\n", "\n", "print(term_table)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modify to display the indices that you want" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text = index_by_label + term_table\n", "#text = index_by_name + index_by_label + term_table" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# read in header and footer, merge with terms table, and output\n", "\n", "headerObject = open(headerFileName, 'rt', encoding='utf-8')\n", "header = headerObject.read()\n", "headerObject.close()\n", "\n", "footerObject = open(footerFileName, 'rt', encoding='utf-8')\n", "footer = footerObject.read()\n", "footerObject.close()\n", "\n", "output = header + text + footer\n", "outputObject = open(outFileName, 'wt', encoding='utf-8')\n", "outputObject.write(output)\n", "outputObject.close()\n", " \n", "print('done')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }