# Script to build Markdown pages that provide term metadata for complex vocabularies
# Steve Baskauf 2020-08-12 CC0
# updated 2021-02-11
# This script merges static Markdown header and footer documents with term information tables (in Markdown)
# generated from data in the rs.tdwg.org repo from the TDWG Github site
#
# NOTE(review): this file was recovered from a copy in which all line structure had been collapsed and
# every HTML tag had been stripped out of the string literals. The statement layout and the HTML markup
# below were reconstructed from the surviving text; every reconstructed literal is marked NOTE(review)
# and should be confirmed against a pristine copy of the script before the output is published.

import re
import requests   # best library to manage HTTP transactions
import csv        # library to read/write/parse CSV files
import json       # library to convert JSON to Python data structures
import pandas as pd
import yaml

# -----------------
# Configuration section
# -----------------

# !!!! NOTE !!!!
# There is not currently an example of a complex vocabulary that has the column headers
# used in the sample files. In order to test this script, it uses the Audubon Core files,
# which have headers that differ from the samples. So throughout the code, there are
# pairs of lines where the default header names are commented out and the Audubon Core
# headers are not. To build a page using the sample files, you will need to reverse the
# commenting of these pairs.

github_branch = 'master'  # "master" for production, something else for development

# This is the base URL for raw files from the branch of the repo that has been pushed to GitHub
githubBaseUri = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/' + github_branch + '/'

headerFileName = 'termlist-header.md'
footerFileName = 'termlist-footer.md'
outFileName = '../docs/list/index.md'

# This is a Python list of the database names of the term lists to be included in the document.
termLists = ['terms', 'iri', 'dc-for-dwc', 'dcterms-for-dwc']
#termLists = ['pathway']

# If this list of terms is for terms in a single namespace, set the value of has_namespace to True. The value
# of has_namespace should be False for a list of terms that contains multiple namespaces.
has_namespace = False

# NOTE! There may be problems unless every term list is of the same vocabulary type since the number of columns will differ
# However, there probably aren't any circumstances where mixed types will be used to generate the same page.
vocab_type = 1  # 1 is simple vocabulary, 2 is simple controlled vocabulary, 3 is c.v. with broader hierarchy

# Terms in large vocabularies like Darwin and Audubon Cores may be organized into categories using tdwgutility_organizedInClass
# If so, those categories can be used to group terms in the generated term list document.
organized_in_categories = True

# If organized in categories, the display_order list must contain the IRIs that are values of tdwgutility_organizedInClass
# If not organized into categories, the value is irrelevant. There just needs to be one item in the list.
display_order = ['', 'http://purl.org/dc/elements/1.1/', 'http://purl.org/dc/terms/', 'http://rs.tdwg.org/dwc/terms/Occurrence', 'http://rs.tdwg.org/dwc/terms/Organism', 'http://rs.tdwg.org/dwc/terms/MaterialEntity', 'http://rs.tdwg.org/dwc/terms/MaterialSample', 'http://rs.tdwg.org/dwc/terms/Event', 'http://purl.org/dc/terms/Location', 'http://rs.tdwg.org/dwc/terms/GeologicalContext', 'http://rs.tdwg.org/dwc/terms/Identification', 'http://rs.tdwg.org/dwc/terms/Taxon', 'http://rs.tdwg.org/dwc/terms/MeasurementOrFact', 'http://rs.tdwg.org/dwc/terms/ResourceRelationship', 'http://rs.tdwg.org/dwc/terms/attributes/UseWithIRI']
display_label = ['Record level', 'Dublin Core legacy namespace', 'Dublin Core terms namespace', 'Occurrence', 'Organism', 'Material Entity', 'Material Sample', 'Event', 'Location', 'Geological Context', 'Identification', 'Taxon', 'Measurement or Fact', 'Resource Relationship', 'IRI-value terms']
display_comments = ['','','','','','','','','','','','','','','']
display_id = ['record_level', 'dc', 'dcterms', 'occurrence', 'organism', 'material_entity', 'material_sample', 'event', 'location', 'geological_context', 'identification', 'taxon', 'measurement_or_fact', 'resource_relationship', 'use_with_iri']

#display_order = ['']
#display_label = ['Vocabulary'] # these are the section labels for the categories in the page
#display_comments = [''] # these are the comments about the category to be appended following the section labels
#display_id = ['Vocabulary'] # these are the fragment identifiers for the associated sections for the categories

# ---------------
# Load header data
# ---------------

config_file_path = 'process/document_metadata_processing/dwc_doc_list/'
contributors_yaml_file = 'authors_configuration.yaml'
document_configuration_yaml_file = 'document_configuration.yaml'

# SECURITY NOTE: the yaml.load(..., Loader=yaml.FullLoader) calls below parse documents fetched over
# the network. The files are expected to be plain mappings/lists maintained in the TDWG repository;
# if that trust assumption ever changes, switch these calls to yaml.safe_load.

if has_namespace:
    # Load the configuration file used in the metadata creation process.
    metadata_config_text = requests.get(githubBaseUri + 'process/config.yaml').text
    metadata_config = yaml.load(metadata_config_text, Loader=yaml.FullLoader)
    namespace_uri = metadata_config['namespaces'][0]['namespace_uri']
    pref_namespace_prefix = metadata_config['namespaces'][0]['pref_namespace_prefix']

# Load the contributors YAML file from its GitHub URL
contributors_yaml_url = githubBaseUri + config_file_path + contributors_yaml_file
contributors_yaml = requests.get(contributors_yaml_url).text
if contributors_yaml == '404: Not Found':
    print('Contributors YAML file not found. Check the URL.')
    print(contributors_yaml_url)
    # Stop with a non-zero status so that calling shells/CI notice the failure.
    raise SystemExit(1)
contributors_yaml = yaml.load(contributors_yaml, Loader=yaml.FullLoader)

# Load the document configuration YAML file from its GitHub URL
document_configuration_yaml_url = githubBaseUri + config_file_path + document_configuration_yaml_file
document_configuration_yaml = requests.get(document_configuration_yaml_url).text
document_configuration_yaml = yaml.load(document_configuration_yaml, Loader=yaml.FullLoader)

# ---------------
# Function definitions
# ---------------

# replace URL with link (function used with Audubon Core list of terms build script)
# Does not correctly handle URLs with close parens ) characters, so no longer used.
#
def createLinks(text):
    """Wrap every http(s) URL found in text in an HTML anchor element.

    A trailing period is treated as sentence punctuation and is kept outside the link.
    Retained only for reference; the script now uses convert_link() instead.
    """
    # NOTE(review): the anchor markup in repl() was stripped from the recovered source and has been
    # reconstructed; only the match.group(1) expressions and the trailing '.' survived.
    def repl(match):
        if match.group(1)[-1] == '.':
            return '<a href="' + match.group(1)[:-1] + '">' + match.group(1)[:-1] + '</a>.'
        return '<a href="' + match.group(1) + '">' + match.group(1) + '</a>'

    # Raw string: the pattern contains \s and \), which are invalid escapes in a normal string literal.
    pattern = r'(https?://[^\s,;\)"<]*)'
    result = re.sub(pattern, repl, text)
    return result

# 2021-08-06 Replace the createLinks() function with functions copied from the QRG build script written by S. Van Hoey
def convert_code(text_with_backticks):
    """Takes all back-quoted sections in a text field and converts it to
    the html tagged version of code blocks <code>...</code>
    """
    # NOTE(review): the <code> tags in the replacement were stripped from the recovered source
    # (only r'\1' survived); reconstructed from the docstring's description.
    return re.sub(r'`([^`]*)`', r'<code>\1</code>', text_with_backticks)

def convert_link(text_with_urls):
    """Takes all links in a text field and converts it to the html tagged
    version of the link
    """
    def _handle_matched(inputstring):
        """quick hack version of url handling on the current prime versions data"""
        url = inputstring.group()
        # NOTE(review): anchor markup reconstructed; the recovered source kept only "{}" placeholders.
        return "<a href=\"{}\">{}</a>".format(url, url)

    # NOTE(review): the recovered source is cut off right after "(?" ; the negative lookbehind that
    # keeps trailing ')', '.' and ',' out of the link is reconstructed and must be confirmed.
    regx = r"(http[s]?://[\w\d:#@%/;$()~_?\+-;=\\.&]*)(?<![\)\.,])"
    return re.sub(regx, _handle_matched, text_with_urls)

# Hack the code taken from the terms.tmpl template to insert the HTML necessary to make the semicolon-separated
# lists of examples into an unordered list.
# NOTE(review): the comment above and the template excerpt below were partly lost in the recovered
# source; only the "{% for example in examples %} ... {{ example }} ... {% endfor %}{% endif %}" text survived.
# {% if concept.examples %}<ul class="list-group list-group-flush">{% for example in examples %}
#   <li class="list-group-item">{{ example }}</li>{% endfor %}</ul>{% endif %}

def convert_examples(text_with_list_of_examples: str) -> str:
    """Turn a '; '-separated string of examples into an HTML unordered list.

    A string holding a single example (no '; ' separator) is returned unchanged.
    """
    examples_list = text_with_list_of_examples.split('; ')
    if len(examples_list) == 1:
        return examples_list[0]
    else:
        # NOTE(review): list markup and the loop over examples_list were stripped from the recovered
        # source (only "output = ''" and "return output" survived); reconstructed from the template above.
        output = '<ul class="list-group list-group-flush">\n'
        for example in examples_list:
            output += '  <li class="list-group-item">' + example + '</li>\n'
        output += '</ul>'
        return output

# ---------------
# Retrieve term list metadata from GitHub
# ---------------

print('Retrieving term list metadata from GitHub')
term_lists_info = []

frame = pd.read_csv(githubBaseUri + 'term-lists/term-lists.csv', na_filter=False)
for termList in termLists:
    term_list_dict = {'database': termList}
    for index,row in frame.iterrows():
        if row['database'] == termList:
            term_list_dict['pref_ns_prefix'] = row['vann_preferredNamespacePrefix']
            term_list_dict['pref_ns_uri'] = row['vann_preferredNamespaceUri']
            term_list_dict['list_iri'] = row['list']
    term_lists_info.append(term_list_dict)
print(term_lists_info)
print()

# ---------------
# Create metadata table and populate using data from namespace databases in GitHub
# ---------------

# Create column list
column_list = ['pref_ns_prefix', 'pref_ns_uri', 'term_localName', 'label', 'rdfs_comment', 'dcterms_description', 'examples', 'term_modified', 'term_deprecated', 'rdf_type', 'tdwgutility_abcdEquivalence', 'replaces_term', 'replaces1_term']
#column_list = ['pref_ns_prefix', 'pref_ns_uri', 'term_localName', 'label', 'definition', 'usage', 'notes', 'term_modified', 'term_deprecated', 'type']
if vocab_type == 2:
    column_list += ['controlled_value_string']
elif vocab_type == 3:
    column_list += ['controlled_value_string', 'skos_broader']
if organized_in_categories:
    column_list.append('tdwgutility_organizedInClass')
column_list.append('version_iri')

print('Retrieving metadata about terms from all namespaces from GitHub')
# Create list of lists metadata table
table_list = []
for term_list in term_lists_info:
    # retrieve versions metadata for term list
    versions_url = githubBaseUri + term_list['database'] + '-versions/' + term_list['database'] + '-versions.csv'
    versions_df = pd.read_csv(versions_url, na_filter=False)

    # Index the recommended version IRI of each term once per term list instead of rescanning the
    # versions table for every term. Later rows overwrite earlier ones, so when a term has several
    # "recommended" rows the last one wins, as it did with the original full scan.
    recommended_versions = {}
    for vindex, vrow in versions_df.iterrows():
        if vrow['version_status'] == 'recommended':
            recommended_versions[vrow['term_localName']] = vrow['version']

    # retrieve current term metadata for term list
    data_url = githubBaseUri + term_list['database'] + '/' + term_list['database'] + '.csv'
    frame = pd.read_csv(data_url, na_filter=False)
    for index,row in frame.iterrows():
        row_list = [term_list['pref_ns_prefix'], term_list['pref_ns_uri'], row['term_localName'], row['label'], row['rdfs_comment'], row['dcterms_description'], row['examples'], row['term_modified'], row['term_deprecated'], row['rdf_type'], row['tdwgutility_abcdEquivalence'], row['replaces_term'], row['replaces1_term']]
        #row_list = [term_list['pref_ns_prefix'], term_list['pref_ns_uri'], row['term_localName'], row['label'], row['definition'], row['usage'], row['notes'], row['term_modified'], row['term_deprecated'], row['type']]
        if vocab_type == 2:
            row_list += [row['controlled_value_string']]
        elif vocab_type == 3:
            if row['skos_broader'] == '':
                row_list += [row['controlled_value_string'], '']
            else:
                row_list += [row['controlled_value_string'], term_list['pref_ns_prefix'] + ':' + row['skos_broader']]
        if organized_in_categories:
            row_list.append(row['tdwgutility_organizedInClass'])

        # Borrowed terms really don't have implemented versions. They may be lacking values for version_status.
        # In their case, their version IRI will be omitted.
        version_iri = recommended_versions.get(row['term_localName'], '')
        # NOTE: the current hack for non-TDWG terms without a version is to append # to the end of the term IRI
        # endswith() is also safe for an empty version string, where indexing the last character would fail.
        if version_iri.endswith('#'):
            version_iri = ''
        row_list.append(version_iri)

        table_list.append(row_list)

print('processing data')
# Turn list of lists into dataframe
terms_df = pd.DataFrame(table_list, columns = column_list)

terms_sorted_by_label = terms_df.sort_values(by='label')
#terms_sorted_by_localname = terms_df.sort_values(by='term_localName')

# This makes sort case insensitive
terms_sorted_by_localname = terms_df.iloc[terms_df.term_localName.str.lower().argsort()]
#terms_sorted_by_localname
print('done retrieving')
print()

# ---------------
# generate the index of terms grouped by category and sorted alphabetically by lowercase term local name
# ---------------

print('Generating term index by CURIE')
text = '### 3.1 Index By Term Name\n\n'
text += '(See also [3.2 Index By Label](#32-index-by-label))\n\n'

text += '**Classes**\n'
text += '\n'
for row_index,row in terms_sorted_by_localname.iterrows():
    if row['rdf_type'] == 'http://www.w3.org/2000/01/rdf-schema#Class':
        curie = row['pref_ns_prefix'] + ":" + row['term_localName']
        curie_anchor = curie.replace(':','_')
        text += '[' + curie + '](#' + curie_anchor + ') |\n'
text = text[:-2] # remove final trailing vertical bar and newline
text += '\n\n' # put back removed newline

for category in range(0,len(display_order)):
    text += '**' + display_label[category] + '**\n'
    text += '\n'
    if organized_in_categories:
        filtered_table = terms_sorted_by_localname[terms_sorted_by_localname['tdwgutility_organizedInClass']==display_order[category]]
        # Rebind rather than reset in place: the filtered frame is a slice of the sorted frame.
        filtered_table = filtered_table.reset_index(drop=True)
    else:
        filtered_table = terms_sorted_by_localname

    for row_index,row in filtered_table.iterrows():
        if row['rdf_type'] != 'http://www.w3.org/2000/01/rdf-schema#Class':
            curie = row['pref_ns_prefix'] + ":" + row['term_localName']
            curie_anchor = curie.replace(':','_')
            text += '[' + curie + '](#' + curie_anchor + ') |\n'
    text = text[:-2] # remove final trailing vertical bar and newline
    text += '\n\n' # put back removed newline

index_by_name = text
#print(index_by_name)
print()

# ---------------
# generate the index of terms by label
# ---------------

print('Generating term index by label')
text = '\n\n'

# Comment out the following two lines if there is no index by local names
text = '### 3.2 Index By Label\n\n'
text += '(See also [3.1 Index By Term Name](#31-index-by-term-name))\n\n'

text += '**Classes**\n'
text += '\n'
for row_index,row in terms_sorted_by_label.iterrows():
    if row['rdf_type'] == 'http://www.w3.org/2000/01/rdf-schema#Class':
        curie_anchor = row['pref_ns_prefix'] + "_" + row['term_localName']
        text += '[' + row['label'] + '](#' + curie_anchor + ') |\n'
text = text[:-2] # remove final trailing vertical bar and newline
text += '\n\n' # put back removed newline

for category in range(0,len(display_order)):
    if organized_in_categories:
        text += '**' + display_label[category] + '**\n'
        text += '\n'
        filtered_table = terms_sorted_by_label[terms_sorted_by_label['tdwgutility_organizedInClass']==display_order[category]]
        filtered_table = filtered_table.reset_index(drop=True)
    else:
        filtered_table = terms_sorted_by_label

    # Skip a row whose label repeats the label of the row just before it (prevents duplicate labels).
    # Tracking the previous label directly works whether or not the frame index has been reset;
    # looking the previous row up by position with the index label only worked after a reset.
    previous_label = None
    for row_index,row in filtered_table.iterrows():
        if row['label'] != previous_label:
            if row['rdf_type'] != 'http://www.w3.org/2000/01/rdf-schema#Class':
                curie_anchor = row['pref_ns_prefix'] + "_" + row['term_localName']
                text += '[' + row['label'] + '](#' + curie_anchor + ') |\n'
        previous_label = row['label']
    text = text[:-2] # remove final trailing vertical bar and newline
    text += '\n\n' # put back removed newline

index_by_label = text
print()
#print(index_by_label)

# Use the configured branch for the decisions table as well (it was previously pinned to "master").
decisions_df = pd.read_csv(githubBaseUri + 'decisions/decisions-links.csv', na_filter=False)

# Group decision local names by the IRI of the affected resource so that each term needs only one
# dictionary lookup; the order of decisions within a term follows the order of rows in the CSV.
decisions_by_resource = {}
for drow_index,drow in decisions_df.iterrows():
    decisions_by_resource.setdefault(drow['linked_affected_resource'], []).append(drow['decision_localName'])

# ---------------
# generate a table for each term, with terms grouped by category
# ---------------

print('Generating terms table')
# generate the Markdown for the terms table
# NOTE(review): every HTML tag in the string literals of this section was stripped from the recovered
# source. The row/cell structure and the cell text are taken from what survived; the exact tags and
# attributes (table/thead/tbody/tr/th/td, anchors, <strong>) are reconstructed and must be confirmed.
text = '## 4 Vocabulary\n'
filtered_table = terms_sorted_by_localname

#for category in range(0,len(display_order)):
#    if organized_in_categories:
#        text += '### 4.' + str(category + 1) + ' ' + display_label[category] + '\n'
#        text += '\n'
#        text += display_comments[category] # insert the comments for the category, if any.
#        filtered_table = terms_sorted_by_localname[terms_sorted_by_localname['tdwgutility_organizedInClass']==display_order[category]]
#        filtered_table.reset_index(drop=True, inplace=True)
#    else:
#        filtered_table = terms_sorted_by_localname

for row_index,row in filtered_table.iterrows():
    text += '<table>\n'
    curie = row['pref_ns_prefix'] + ":" + row['term_localName']
    curieAnchor = curie.replace(':','_')
    text += '\t<thead>\n'
    text += '\t\t<tr>\n'
    text += '\t\t\t<th colspan="2"><a id="' + curieAnchor + '"></a>Term Name ' + curie + '</th>\n'
    text += '\t\t</tr>\n'
    text += '\t</thead>\n'
    text += '\t<tbody>\n'
    text += '\t\t<tr>\n'
    text += '\t\t\t<td>Term IRI</td>\n'
    uri = row['pref_ns_uri'] + row['term_localName']
    text += '\t\t\t<td><a href="' + uri + '">' + uri + '</a></td>\n'
    text += '\t\t</tr>\n'
    text += '\t\t<tr>\n'
    text += '\t\t\t<td>Modified</td>\n'
    text += '\t\t\t<td>' + row['term_modified'] + '</td>\n'
    text += '\t\t</tr>\n'

    if row['version_iri'] != '':
        text += '\t\t<tr>\n'
        text += '\t\t\t<td>Term version IRI</td>\n'
        text += '\t\t\t<td><a href="' + row['version_iri'] + '">' + row['version_iri'] + '</a></td>\n'
        text += '\t\t</tr>\n'

    text += '\t\t<tr>\n'
    text += '\t\t\t<td>Label</td>\n'
    text += '\t\t\t<td>' + row['label'] + '</td>\n'
    text += '\t\t</tr>\n'

    if row['term_deprecated'] != '':
        text += '\t\t<tr>\n'
        text += '\t\t\t<td></td>\n'
        text += '\t\t\t<td><strong>This term is deprecated and should no longer be used.</strong></td>\n'
        text += '\t\t</tr>\n'

        # Find the term(s) that list this deprecated term as one they replace.
        for dep_index,dep_row in filtered_table.iterrows():
            if dep_row['replaces_term'] == uri:
                text += '\t\t<tr>\n'
                text += '\t\t\t<td>Is replaced by</td>\n'
                text += '\t\t\t<td><a href="#' + dep_row['pref_ns_prefix'] + "_" + dep_row['term_localName'] + '">' + dep_row['pref_ns_uri'] + dep_row['term_localName'] + '</a></td>\n'
                text += '\t\t</tr>\n'
            if dep_row['replaces1_term'] == uri:
                text += '\t\t<tr>\n'
                text += '\t\t\t<td>Is replaced by</td>\n'
                text += '\t\t\t<td><a href="#' + dep_row['pref_ns_prefix'] + "_" + dep_row['term_localName'] + '">' + dep_row['pref_ns_uri'] + dep_row['term_localName'] + '</a></td>\n'
                text += '\t\t</tr>\n'

    text += '\t\t<tr>\n'
    text += '\t\t\t<td>Definition</td>\n'
    text += '\t\t\t<td>' + row['rdfs_comment'] + '</td>\n'
    #text += '\t\t\t<td>' + row['definition'] + '</td>\n'
    text += '\t\t</tr>\n'

    if row['dcterms_description'] != '':
    #if row['notes'] != '':
        text += '\t\t<tr>\n'
        text += '\t\t\t<td>Notes</td>\n'
        text += '\t\t\t<td>' + convert_link(convert_code(row['dcterms_description'])) + '</td>\n'
        #text += '\t\t\t<td>' + convert_link(convert_code(row['notes'])) + '</td>\n'
        text += '\t\t</tr>\n'

    if row['examples'] != '':
    #if row['usage'] != '':
        text += '\t\t<tr>\n'
        text += '\t\t\t<td>Examples</td>\n'
        text += '\t\t\t<td>' + convert_examples(convert_link(convert_code(row['examples']))) + '</td>\n'
        #text += '\t\t\t<td>' + convert_link(convert_code(row['usage'])) + '</td>\n'
        text += '\t\t</tr>\n'

    if row['tdwgutility_abcdEquivalence'] != '':
        text += '\t\t<tr>\n'
        text += '\t\t\t<td>ABCD equivalence</td>\n'
        text += '\t\t\t<td>' + convert_link(convert_code(row['tdwgutility_abcdEquivalence'])) + '</td>\n'
        text += '\t\t</tr>\n'

    if vocab_type == 2 or vocab_type == 3: # controlled vocabulary
        text += '\t\t<tr>\n'
        text += '\t\t\t<td>Controlled value</td>\n'
        text += '\t\t\t<td>' + row['controlled_value_string'] + '</td>\n'
        text += '\t\t</tr>\n'

    if vocab_type == 3 and row['skos_broader'] != '': # controlled vocabulary with skos:broader relationships
        text += '\t\t<tr>\n'
        text += '\t\t\t<td>Has broader concept</td>\n'
        curieAnchor = row['skos_broader'].replace(':','_')
        text += '\t\t\t<td><a href="#' + curieAnchor + '">' + row['skos_broader'] + '</a></td>\n'
        text += '\t\t</tr>\n'

    text += '\t\t<tr>\n'
    text += '\t\t\t<td>Type</td>\n'
    if row['rdf_type'] == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property':
    #if row['type'] == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property':
        text += '\t\t\t<td>Property</td>\n'
    elif row['rdf_type'] == 'http://www.w3.org/2000/01/rdf-schema#Class':
    #elif row['type'] == 'http://www.w3.org/2000/01/rdf-schema#Class':
        text += '\t\t\t<td>Class</td>\n'
    elif row['rdf_type'] == 'http://www.w3.org/2004/02/skos/core#Concept':
    #elif row['type'] == 'http://www.w3.org/2004/02/skos/core#Concept':
        text += '\t\t\t<td>Concept</td>\n'
    else:
        text += '\t\t\t<td>' + row['rdf_type'] + '</td>\n' # this should rarely happen
        #text += '\t\t\t<td>' + row['type'] + '</td>\n' # this should rarely happen
    text += '\t\t</tr>\n'

    # Look up decisions related to this term
    for decision_localName in decisions_by_resource.get(uri, []):
        text += '\t\t<tr>\n'
        text += '\t\t\t<td>Executive Committee decision</td>\n'
        text += '\t\t\t<td><a href="http://rs.tdwg.org/decisions/' + decision_localName + '">http://rs.tdwg.org/decisions/' + decision_localName + '</a></td>\n'
        text += '\t\t</tr>\n'

    text += '\t</tbody>\n'
    text += '</table>\n'
    text += '\n'
    text += '\n'
term_table = text
print('done generating')
print()
#print(term_table)

# ---------------
# Merge term table with header and footer Markdown, then save file
# ---------------

print('Merging term table with header and footer and saving file')
#text = index_by_label + term_table
text = index_by_name + index_by_label + term_table

# read in header and footer, merge with terms table, and output

with open(headerFileName, 'rt', encoding='utf-8') as headerObject:
    header = headerObject.read()

# Build the Markdown for the contributors list: "[name](iri) ([affiliation](uri))", comma separated
contributors = ', '.join(
    '[' + contributor['contributor_literal'] + '](' + contributor['contributor_iri'] + ') '
    + '([' + contributor['affiliation'] + '](' + contributor['affiliation_uri'] + '))'
    for contributor in contributors_yaml
)

# Substitute values of ratification_date and contributors into the header template
header = header.replace('{document_title}', document_configuration_yaml['documentTitle'])
header = header.replace('{ratification_date}', document_configuration_yaml['doc_modified'])
header = header.replace('{created_date}', document_configuration_yaml['doc_created'])
header = header.replace('{contributors}', contributors)
header = header.replace('{standard_iri}', document_configuration_yaml['dcterms_isPartOf'])
header = header.replace('{current_iri}', document_configuration_yaml['current_iri'])
header = header.replace('{abstract}', document_configuration_yaml['abstract'])
header = header.replace('{creator}', document_configuration_yaml['creator'])
header = header.replace('{publisher}', document_configuration_yaml['publisher'])
year = document_configuration_yaml['doc_modified'].split('-')[0]
header = header.replace('{year}', year)
if has_namespace:
    header = header.replace('{namespace_uri}', namespace_uri)
    header = header.replace('{pref_namespace_prefix}', pref_namespace_prefix)

# Determine whether there was a previous version of the document.
previous_version_metadata_string = ''
if document_configuration_yaml['doc_created'] != document_configuration_yaml['doc_modified']:
    # Load versions list from document versions data in the rs.tdwg.org repo and find most recent version.
    versions_data_url = githubBaseUri + 'docs/docs-versions.csv'
    versions_list_df = pd.read_csv(versions_data_url, na_filter=False)
    # Slice all rows for versions of this document.
    matching_versions = versions_list_df[versions_list_df['current_iri']==document_configuration_yaml['current_iri']]
    # Sort the matching versions by version IRI in descending order so that the most recent version is first.
    matching_versions = matching_versions.sort_values(by=['version_iri'], ascending=[False])
    if len(matching_versions) > 1:
        # The previous version is the second row in the dataframe (row 1).
        # The version IRI is in the second column (column 1).
        most_recent_version_iri = matching_versions.iat[1, 1]
        #print(most_recent_version_iri)

        # Insert the previous version information into the header
        # NOTE(review): the line breaks inside this literal were collapsed in the recovered source
        # ("Previous version : <...> "); the layout below is reconstructed and should be confirmed
        # against the header template.
        previous_version_metadata_string = 'Previous version\n: <' + most_recent_version_iri + '>\n\n'
    else:
        # Fewer than two recorded versions: there is no "second row" to read, so leave the slot empty
        # instead of failing with an IndexError.
        print('Warning: document dates differ but no previous version was found in docs-versions.csv')

# Insert the previous version information into the designated slot, or remove the slot from the
# header when there is no previous version.
header = header.replace('{previous_version_slot}\n\n', previous_version_metadata_string)

with open(footerFileName, 'rt', encoding='utf-8') as footerObject:
    footer = footerObject.read()

output = header + text + footer
with open(outFileName, 'wt', encoding='utf-8') as outputObject:
    outputObject.write(output)

print('done')