From f4d9e6b9976d3b3010b93d31b07b6bfc54eab8f6 Mon Sep 17 00:00:00 2001
From: Steve Baskauf <steve.baskauf@vanderbilt.edu>
Date: Sat, 16 Sep 2023 17:38:57 -0500
Subject: [PATCH] change list of terms build script to use templated header

---
 build/build-termlist.py  | 87 +++++++++++++++++++++++++++++++++++++++-
 build/termlist-header.md | 29 +++++++-------
 2 files changed, 100 insertions(+), 16 deletions(-)

diff --git a/build/build-termlist.py b/build/build-termlist.py
index eff03c1..237ea10 100644
--- a/build/build-termlist.py
+++ b/build/build-termlist.py
@@ -8,6 +8,7 @@ import requests   # best library to manage HTTP transactions
 import csv        # library to read/write/parse CSV files
 import json       # library to convert JSON to Python data structures
 import pandas as pd
+import yaml
 
 # -----------------
 # Configuration section
@@ -21,8 +22,10 @@ import pandas as pd
 # headers are not. To build a page using the sample files, you will need to reverse the
 # commenting of these pairs.
 
+github_branch = 'main' # "main" for production, something else for development
+
 # This is the base URL for raw files from the branch of the repo that has been pushed to GitHub
-githubBaseUri = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/materialentity/'
+githubBaseUri = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/' + github_branch + '/'
 
 headerFileName = 'termlist-header.md'
 footerFileName = 'termlist-footer.md'
@@ -32,6 +35,10 @@ outFileName = '../docs/list/index.md'
 termLists = ['terms', 'iri', 'dc-for-dwc', 'dcterms-for-dwc']
 #termLists = ['pathway']
 
+# If all of the terms in this list belong to a single namespace, set the value of has_namespace to True.
+# Set has_namespace to False for a list that includes terms from multiple namespaces.
+has_namespace = False
+
 # NOTE! There may be problems unless every term list is of the same vocabulary type since the number of columns will differ
 # However, there probably aren't any circumstances where mixed types will be used to generate the same page.
 vocab_type = 1 # 1 is simple vocabulary, 2 is simple controlled vocabulary, 3 is c.v. with broader hierarchy
@@ -52,6 +59,35 @@ display_id = ['record_level', 'dc', 'dcterms', 'occurrence', 'organism', 'materi
 #display_comments = [''] # these are the comments about the category to be appended following the section labels
 #display_id = ['Vocabulary'] # these are the fragment identifiers for the associated sections for the categories
 
+# ---------------
+# Load header data
+# ---------------
+
+config_file_path = 'process/document_metadata_processing/dwc_doc_list/'
+contributors_yaml_file = 'authors_configuration.yaml'
+document_configuration_yaml_file = 'document_configuration.yaml'
+
+if has_namespace:
+    # Load the configuration file used in the metadata creation process.
+    metadata_config_text = requests.get(githubBaseUri + 'process/config.yaml').text
+    metadata_config = yaml.load(metadata_config_text, Loader=yaml.FullLoader)
+    namespace_uri = metadata_config['namespaces'][0]['namespace_uri']
+    pref_namespace_prefix = metadata_config['namespaces'][0]['pref_namespace_prefix']
+
+# Load the contributors YAML file from its GitHub URL
+contributors_yaml_url = githubBaseUri + config_file_path + contributors_yaml_file
+contributors_yaml = requests.get(contributors_yaml_url).text
+if contributors_yaml == '404: Not Found':
+    print('Contributors YAML file not found. Check the URL.')
+    print(contributors_yaml_url)
+    exit()
+contributors_yaml = yaml.load(contributors_yaml, Loader=yaml.FullLoader)
+
+# Load the document configuration YAML file from its GitHub URL
+document_configuration_yaml_url = githubBaseUri + config_file_path + document_configuration_yaml_file
+document_configuration_yaml = requests.get(document_configuration_yaml_url).text
+document_configuration_yaml = yaml.load(document_configuration_yaml, Loader=yaml.FullLoader)
+
 # ---------------
 # Function definitions
 # ---------------
@@ -178,6 +214,7 @@ for term_list in term_lists_info:
 
         table_list.append(row_list)
 
+print('processing data')
 # Turn list of lists into dataframe
 terms_df = pd.DataFrame(table_list, columns = column_list)
 
@@ -432,6 +469,54 @@ headerObject = open(headerFileName, 'rt', encoding='utf-8')
 header = headerObject.read()
 headerObject.close()
 
+# Build the Markdown for the contributors list
+contributors = ''
+for contributor in contributors_yaml:
+    contributors += '[' + contributor['contributor_literal'] + '](' + contributor['contributor_iri'] + ') '
+    contributors += '([' + contributor['affiliation'] + '](' + contributor['affiliation_uri'] + ')), '
+contributors = contributors[:-2] # Remove the last comma and space
+
+# Substitute values from the document configuration and the contributors list into the header template
+header = header.replace('{document_title}', document_configuration_yaml['documentTitle'])
+header = header.replace('{ratification_date}', document_configuration_yaml['doc_modified'])
+header = header.replace('{created_date}', document_configuration_yaml['doc_created'])
+header = header.replace('{contributors}', contributors)
+header = header.replace('{standard_iri}', document_configuration_yaml['dcterms_isPartOf'])
+header = header.replace('{current_iri}', document_configuration_yaml['current_iri'])
+header = header.replace('{abstract}', document_configuration_yaml['abstract'])
+header = header.replace('{creator}', document_configuration_yaml['creator'])
+header = header.replace('{publisher}', document_configuration_yaml['publisher'])
+year = document_configuration_yaml['doc_modified'].split('-')[0]
+header = header.replace('{year}', year)
+if has_namespace:
+    header = header.replace('{namespace_uri}', namespace_uri)
+    header = header.replace('{pref_namespace_prefix}', pref_namespace_prefix)
+
+# Determine whether there was a previous version of the document.
+if document_configuration_yaml['doc_created'] != document_configuration_yaml['doc_modified']:
+    # Load the versions list from the document versions data in the rs.tdwg.org repo and find the previous version.
+    versions_data_url = githubBaseUri + 'docs/docs-versions.csv'
+    versions_list_df = pd.read_csv(versions_data_url, na_filter=False)
+    # Slice all rows for versions of this document.
+    matching_versions = versions_list_df[versions_list_df['current_iri']==document_configuration_yaml['current_iri']]
+    # Sort the matching versions by version IRI in descending order so that the most recent version is first.
+    matching_versions = matching_versions.sort_values(by=['version_iri'], ascending=[False])
+    # The previous version is the second row in the dataframe (row 1); this assumes at least two versions are listed.
+    # The version IRI is in the second column (column 1).
+    most_recent_version_iri = matching_versions.iat[1, 1]
+    #print(most_recent_version_iri)
+
+    # Build the Markdown for the previous version entry of the header
+    previous_version_metadata_string = '''Previous version
+: <''' + most_recent_version_iri + '''>
+
+'''
+    # Insert the previous version information into the designated slot.
+    header = header.replace('{previous_version_slot}\n\n', previous_version_metadata_string)
+else:
+    # If there was no previous version, remove the slot from the header.
+    header = header.replace('{previous_version_slot}\n\n', '')
+
 footerObject = open(footerFileName, 'rt', encoding='utf-8')
 footer = footerObject.read()
 footerObject.close()
diff --git a/build/termlist-header.md b/build/termlist-header.md
index 6f61d2b..648f51f 100644
--- a/build/termlist-header.md
+++ b/build/termlist-header.md
@@ -1,42 +1,41 @@
-# List of Darwin Core terms
+# {document_title}
 
 Title
-: List of Darwin Core terms
+: {document_title}
 
 Date version issued
-: 2023-08-18
+: {ratification_date}
 
 Date created
-: 2020-08-12
+: {created_date}
 
 Part of TDWG Standard
-: <http://www.tdwg.org/standards/450>
+: <{standard_iri}>
 
 This version
-: <http://rs.tdwg.org/dwc/doc/list/2023-08-18>
+: <{current_iri}{ratification_date}>
 
 Latest version
-: <http://rs.tdwg.org/dwc/doc/list/>
+: <{current_iri}>
 
-Previous version
-: <http://rs.tdwg.org/dwc/doc/list/2023-07-07>
+{previous_version_slot}
 
 Abstract
-: Darwin Core is a vocabulary standard for transmitting information about biodiversity. This document lists all terms in namespaces currently used in the vocabulary.
+: {abstract}
 
 Contributors
-: John Wieczorek (VertNet), Peter Desmet (INBO), Steve Baskauf (Vanderbilt University Libraries), Tim Robertson (GBIF), Markus Döring (GBIF), Quentin Groom (Botanic Garden Meise), Stijn Van Hoey (INBO), David Bloom (VertNet), Paula Zermoglio (VertNet), Robert Guralnick (University of Florida), John Deck (Genomic Biodiversity Working Group), Gail Kampmeier (INHS), Dave Vieglais (KUNHM), Renato De Giovanni (CRIA), Campbell Webb (TDWG RDF/OWL Task Group), Paul J. Morris (Harvard University Herbaria/Museum of Comparative Zoölogy), Mark Schildhauer (NCEAS)
+: {contributors}
 
 Creator
-: TDWG Darwin Core Maintenance Group
+: {creator}
 
 Bibliographic citation
-: Darwin Core Maintenance Group. 2023. List of Darwin Core terms. Biodiversity Information Standards (TDWG). <http://rs.tdwg.org/dwc/doc/list/2023-08-18>
+: {creator}. {year}. {document_title}. {publisher}. <{current_iri}{ratification_date}>
 
 
 ## 1 Introduction (Informative)
 
-This document contains terms that are part of the most recent version of the Darwin Core vocabulary (<http://rs.tdwg.org/version/dwc/2023-08-18>).
+This document contains terms that are part of the most recent version of the Darwin Core vocabulary (<http://rs.tdwg.org/version/dwc/{ratification_date}>).
 
 This document includes terms in four namespaces that contain recommended terms: `dwc:`, `dwciri:`, `dc:`, and `dcterms:`. However, some terms in these namespaces are deprecated or superseded and should no longer be used. Deprecation or supersession is noted in the term metadata. Namespaces that contain only deprecated terms are not included in this document, but metadata about those terms can be retrieved by dereferencing their IRIs.
 
@@ -51,7 +50,7 @@ Section 2 is normative.
 In Section 4, the values of the `Term IRI` and `Definition` are normative. The values of `Term Name` are non-normative, although one can expect that the namespace abbreviation prefix is one commonly used for the term namespace.  `Label` and the values of all other properties (such as `Examples` and `Notes`) are non-normative.
 
 ### 1.2 RFC 2119 key words
-The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [RFC 2119](https://tools.ietf.org/html/rfc2119).
+The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [BCP 14](https://www.rfc-editor.org/info/bcp14) [\[RFC 2119\]](https://datatracker.ietf.org/doc/html/rfc2119) and [\[RFC 8174\]](https://datatracker.ietf.org/doc/html/rfc8174) when, and only when, they appear in all capitals, as shown here.
 
 ### 1.3 Namespace abbreviations