change list of terms build script to use templated header

2023-09-16 17:38:57 -05:00 · 2023-09-16 17:38:57 -05:00 · f4d9e6b997
parent 5ae14c4ca1
commit f4d9e6b997
2 changed files with 100 additions and 16 deletions
--- a/build/build-termlist.py
+++ b/build/build-termlist.py
@ -8,6 +8,7 @@ import requests   # best library to manage HTTP transactions
 import csv        # library to read/write/parse CSV files
 import json       # library to convert JSON to Python data structures
 import pandas as pd
 import yaml
 # -----------------
 # Configuration section
@ -21,8 +22,10 @@ import pandas as pd
 # headers are not. To build a page using the sample files, you will need to reverse the
 # commenting of these pairs.
 github_branch = 'main' # "main" for production, something else for development
 # This is the base URL for raw files from the branch of the repo that has been pushed to GitHub
-githubBaseUri = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/materialentity/'
+githubBaseUri = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/' + github_branch + '/'
 headerFileName = 'termlist-header.md'
 footerFileName = 'termlist-footer.md'
@ -32,6 +35,10 @@ outFileName = '../docs/list/index.md'
 termLists = ['terms', 'iri', 'dc-for-dwc', 'dcterms-for-dwc']
 #termLists = ['pathway']
 # If this list of terms is for terms in a single namespace, set the value of has_namespace to True. The value
 # of has_namespace should be False for a list of terms that contains multiple namespaces.
 has_namespace = False
 # NOTE! There may be problems unless every term list is of the same vocabulary type since the number of columns will differ
 # However, there probably aren't any circumstances where mixed types will be used to generate the same page.
 vocab_type = 1 # 1 is simple vocabulary, 2 is simple controlled vocabulary, 3 is c.v. with broader hierarchy
@ -52,6 +59,35 @@ display_id = ['record_level', 'dc', 'dcterms', 'occurrence', 'organism', 'materi
 #display_comments = [''] # these are the comments about the category to be appended following the section labels
 #display_id = ['Vocabulary'] # these are the fragment identifiers for the associated sections for the categories
 # ---------------
 # Load header data
 # ---------------
 config_file_path = 'process/document_metadata_processing/dwc_doc_list/'
 contributors_yaml_file = 'authors_configuration.yaml'
 document_configuration_yaml_file = 'document_configuration.yaml'
 if has_namespace:
    # Load the configuration file used in the metadata creation process.
    metadata_config_text = requests.get(githubBaseUri + 'process/config.yaml').text
    metadata_config = yaml.load(metadata_config_text, Loader=yaml.FullLoader)
    namespace_uri = metadata_config['namespaces'][0]['namespace_uri']
    pref_namespace_prefix = metadata_config['namespaces'][0]['pref_namespace_prefix']
 # Load the contributors YAML file from its GitHub URL
 contributors_yaml_url = githubBaseUri + config_file_path + contributors_yaml_file
 contributors_yaml = requests.get(contributors_yaml_url).text
 if contributors_yaml == '404: Not Found':
    print('Contributors YAML file not found. Check the URL.')
    print(contributors_yaml_url)
    exit()
 contributors_yaml = yaml.load(contributors_yaml, Loader=yaml.FullLoader)
 # Load the document configuration YAML file from its GitHub URL
 document_configuration_yaml_url = githubBaseUri + config_file_path + document_configuration_yaml_file
 document_configuration_yaml = requests.get(document_configuration_yaml_url).text
 document_configuration_yaml = yaml.load(document_configuration_yaml, Loader=yaml.FullLoader)
 # ---------------
 # Function definitions
 # ---------------
@ -178,6 +214,7 @@ for term_list in term_lists_info:
        table_list.append(row_list)
 print('processing data')
 # Turn list of lists into dataframe
 terms_df = pd.DataFrame(table_list, columns = column_list)
@ -432,6 +469,54 @@ headerObject = open(headerFileName, 'rt', encoding='utf-8')
 header = headerObject.read()
 headerObject.close()
 # Assemble the contributors list as Markdown. Each entry is the contributor's name linked to
 # the contributor IRI, followed by the affiliation linked to its URI and wrapped in parentheses.
 contributor_entries = [
    '[' + person['contributor_literal'] + '](' + person['contributor_iri'] + ') '
    + '([' + person['affiliation'] + '](' + person['affiliation_uri'] + '))'
    for person in contributors_yaml
 ]
 # Entries are separated by a comma and a space; no separator follows the final entry.
 contributors = ', '.join(contributor_entries)
 # Fill in the placeholders of the header template. Each pair below is
 # (placeholder, replacement text); the pairs are applied one at a time in the listed order.
 doc_config = document_configuration_yaml
 # The year is the part of doc_modified that precedes its first hyphen.
 year = doc_config['doc_modified'].split('-')[0]
 header_substitutions = [
    ('{document_title}', doc_config['documentTitle']),
    ('{ratification_date}', doc_config['doc_modified']),
    ('{created_date}', doc_config['doc_created']),
    ('{contributors}', contributors),
    ('{standard_iri}', doc_config['dcterms_isPartOf']),
    ('{current_iri}', doc_config['current_iri']),
    ('{abstract}', doc_config['abstract']),
    ('{creator}', doc_config['creator']),
    ('{publisher}', doc_config['publisher']),
    ('{year}', year),
 ]
 # The namespace placeholders are only filled in for a single-namespace term list.
 if has_namespace:
    header_substitutions.append(('{namespace_uri}', namespace_uri))
    header_substitutions.append(('{pref_namespace_prefix}', pref_namespace_prefix))
 for placeholder, replacement in header_substitutions:
    header = header.replace(placeholder, replacement)
 # Determine whether there was a previous version of the document.
 # most_recent_version_iri holds the IRI of the previous version; it stays None when the document
 # has never been modified or when the versions data does not list a previous version.
 most_recent_version_iri = None
 if document_configuration_yaml['doc_created'] != document_configuration_yaml['doc_modified']:
    # Load versions list from document versions data in the rs.tdwg.org repo and find most recent version.
    versions_data_url = githubBaseUri + 'docs/docs-versions.csv'
    versions_list_df = pd.read_csv(versions_data_url, na_filter=False)
    # Slice all rows for versions of this document.
    matching_versions = versions_list_df[versions_list_df['current_iri']==document_configuration_yaml['current_iri']]
    # Sort the matching versions by version IRI in descending order so that the most recent version is first.
    matching_versions = matching_versions.sort_values(by=['version_iri'], ascending=[False])
    # The previous version is the second row in the dataframe (row 1). Guard against fewer than
    # two matching rows, which would otherwise stop the build with an IndexError, and select the
    # version IRI by column name rather than by column position.
    if len(matching_versions) >= 2:
        most_recent_version_iri = matching_versions['version_iri'].iloc[1]
        #print(most_recent_version_iri)
    else:
        print('Warning: no previous version found in the versions data; omitting the previous version from the header.')
        print(versions_data_url)
 # The template must contain the slot followed directly by a blank line for either replacement to match.
 if most_recent_version_iri is not None:
    # Insert the previous version information into the header
    previous_version_metadata_string = '''Previous version
 : <''' + most_recent_version_iri + '''>
 '''
    # Insert the previous version information into the designated slot.
    header = header.replace('{previous_version_slot}\n\n', previous_version_metadata_string)
 else:
    # If there was no previous version, remove the slot from the header.
    header = header.replace('{previous_version_slot}\n\n', '')
 footerObject = open(footerFileName, 'rt', encoding='utf-8')
 footer = footerObject.read()
 footerObject.close()
--- a/build/termlist-header.md
+++ b/build/termlist-header.md
@ -1,42 +1,41 @@
-# List of Darwin Core terms
+# {document_title}
 Title
-: List of Darwin Core terms
+: {document_title}
 Date version issued
-: 2023-08-18
+: {ratification_date}
 Date created
-: 2020-08-12
+: {created_date}
 Part of TDWG Standard
-: <http://www.tdwg.org/standards/450>
+: <{standard_iri}>
 This version
-: <http://rs.tdwg.org/dwc/doc/list/2023-08-18>
+: <{current_iri}{ratification_date}>
 Latest version
-: <http://rs.tdwg.org/dwc/doc/list/>
+: <{current_iri}>
-Previous version
+{previous_version_slot}
-: <http://rs.tdwg.org/dwc/doc/list/2023-07-07>
 Abstract
-: Darwin Core is a vocabulary standard for transmitting information about biodiversity. This document lists all terms in namespaces currently used in the vocabulary.
+: {abstract}
 Contributors
-: John Wieczorek (VertNet), Peter Desmet (INBO), Steve Baskauf (Vanderbilt University Libraries), Tim Robertson (GBIF), Markus Döring (GBIF), Quentin Groom (Botanic Garden Meise), Stijn Van Hoey (INBO), David Bloom (VertNet), Paula Zermoglio (VertNet), Robert Guralnick (University of Florida), John Deck (Genomic Biodiversity Working Group), Gail Kampmeier (INHS), Dave Vieglais (KUNHM), Renato De Giovanni (CRIA), Campbell Webb (TDWG RDF/OWL Task Group), Paul J. Morris (Harvard University Herbaria/Museum of Comparative Zoölogy), Mark Schildhauer (NCEAS)
+: {contributors}
 Creator
-: TDWG Darwin Core Maintenance Group
+: {creator}
 Bibliographic citation
-: Darwin Core Maintenance Group. 2023. List of Darwin Core terms. Biodiversity Information Standards (TDWG). <http://rs.tdwg.org/dwc/doc/list/2023-08-18>
+: {creator}. {year}. {document_title}. {publisher}. <{current_iri}{ratification_date}>
 ## 1 Introduction (Informative)
-This document contains terms that are part of the most recent version of the Darwin Core vocabulary (<http://rs.tdwg.org/version/dwc/2023-08-18>).
+This document contains terms that are part of the most recent version of the Darwin Core vocabulary (<http://rs.tdwg.org/version/dwc/{ratification_date}>).
 This document includes terms in four namespaces that contain recommended terms: `dwc:`, `dwciri:`, `dc:`, and `dcterms:`. However, some terms in these namespaces are deprecated or superseded and should no longer be used. Deprecation or supersession is noted in the term metadata. Namespaces that contain only deprecated terms are not included in this document, but metadata about those terms can be retrieved by dereferencing their IRIs.
@ -51,7 +50,7 @@ Section 2 is normative.
 In Section 4, the values of the `Term IRI` and `Definition` are normative. The values of `Term Name` are non-normative, although one can expect that the namespace abbreviation prefix is one commonly used for the term namespace.  `Label` and the values of all other properties (such as `Examples` and `Notes`) are non-normative.
 ### 1.2 RFC 2119 key words
-The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [RFC 2119](https://tools.ietf.org/html/rfc2119).
+The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [BCP 14](https://www.rfc-editor.org/info/bcp14) [\[RFC 2119\]](https://datatracker.ietf.org/doc/html/rfc2119) and [\[RFC 8174\]](https://datatracker.ietf.org/doc/html/rfc8174) when, and only when, they appear in all capitals, as shown here.
 ### 1.3 Namespace abbreviations