mirror of https://github.com/tdwg/dwc.git
132 lines
5.8 KiB
Python
132 lines
5.8 KiB
Python
# Script to insert header metadata into a document template
|
|
# This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
|
|
# Author: Steve Baskauf
|
|
|
|
script_version = '0.1.0'
|
|
version_modified = '2023-09-17'
|
|
|
|
# NOTE: This script should be run after the script that moves the previous document to a version.
|
|
|
|
import requests
|
|
import pandas as pd
|
|
import yaml
|
|
import sys
|
|
|
|
# -----------------
|
|
# Command line arguments
|
|
# -----------------
|
|
|
|
arg_vals = sys.argv[1:]
|
|
opts = [opt for opt in arg_vals if opt.startswith('-')]
|
|
args = [arg for arg in arg_vals if not arg.startswith('-')]
|
|
|
|
# Name of the last part of the URL of the doc
|
|
if '--slug' in opts:
|
|
document_slug = args[opts.index('--slug')]
|
|
else:
|
|
print('Must specify URL slug for document using --slug option')
|
|
exit()
|
|
|
|
# Used as the directory name
|
|
if '--dir' in opts:
|
|
directory_name = args[opts.index('--dir')]
|
|
else:
|
|
print('Must specify name of directory containing template and configs using --dir option')
|
|
exit()
|
|
|
|
# "master" for production, something else for development
|
|
if '--branch' in opts:
|
|
github_branch = args[opts.index('--branch')]
|
|
else:
|
|
github_branch = 'master'
|
|
|
|
|
|
# -----------------
|
|
# Configuration section
|
|
# -----------------
|
|
|
|
# This is the base URL for raw files from the branch of the repo that has been pushed to GitHub
|
|
github_base_url = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/' + github_branch + '/'
|
|
|
|
document_template_name = 'index_template.md' # This is the input file into which the header metadata will be inserted.
|
|
out_filename = '../docs/' + document_slug + '/index.md' # This is where the document rendered by GitHub Pages will be created.
|
|
|
|
config_file_path = 'process/document_metadata_processing/' + directory_name + '/'
|
|
contributors_yaml_file = 'authors_configuration.yaml'
|
|
document_configuration_yaml_file = 'document_configuration.yaml'
|
|
|
|
# Load the contributors YAML file from its GitHub URL
|
|
contributors_yaml_url = github_base_url + config_file_path + contributors_yaml_file
|
|
contributors_yaml = requests.get(contributors_yaml_url).text
|
|
if contributors_yaml == '404: Not Found':
|
|
print('Contributors YAML file not found. Check the URL.')
|
|
print(contributors_yaml_url)
|
|
exit()
|
|
contributors_yaml = yaml.load(contributors_yaml, Loader=yaml.FullLoader)
|
|
|
|
# Load the document configuration YAML file from its GitHub URL
|
|
document_configuration_yaml_url = github_base_url + config_file_path + document_configuration_yaml_file
|
|
document_configuration_yaml = requests.get(document_configuration_yaml_url).text
|
|
document_configuration_yaml = yaml.load(document_configuration_yaml, Loader=yaml.FullLoader)
|
|
|
|
# -----------------
|
|
# Main script
|
|
# -----------------
|
|
|
|
# Because this is a hack of the build-termlist.py script, "header" is used in variable names, although in this case
|
|
# it is the entire document template, not just the header.
|
|
headerObject = open(directory_name + '/' + document_template_name, 'rt', encoding='utf-8')
|
|
header = headerObject.read()
|
|
headerObject.close()
|
|
|
|
# Build the Markdown for the contributors list
|
|
contributors = ''
|
|
for contributor in contributors_yaml:
|
|
contributors += '[' + contributor['contributor_literal'] + '](' + contributor['contributor_iri'] + ') '
|
|
contributors += '([' + contributor['affiliation'] + '](' + contributor['affiliation_uri'] + ')), '
|
|
contributors = contributors[:-2] # Remove the last comma and space
|
|
|
|
# Substitute values of ratification_date and contributors into the header template
|
|
header = header.replace('{document_title}', document_configuration_yaml['documentTitle'])
|
|
header = header.replace('{ratification_date}', document_configuration_yaml['doc_modified'])
|
|
header = header.replace('{created_date}', document_configuration_yaml['doc_created'])
|
|
header = header.replace('{contributors}', contributors)
|
|
header = header.replace('{standard_iri}', document_configuration_yaml['dcterms_isPartOf'])
|
|
header = header.replace('{current_iri}', document_configuration_yaml['current_iri'])
|
|
header = header.replace('{abstract}', document_configuration_yaml['abstract'])
|
|
header = header.replace('{creator}', document_configuration_yaml['creator'])
|
|
header = header.replace('{publisher}', document_configuration_yaml['publisher'])
|
|
year = document_configuration_yaml['doc_modified'].split('-')[0]
|
|
header = header.replace('{year}', year)
|
|
|
|
# Determine whether there was a previous version of the document.
|
|
if document_configuration_yaml['doc_created'] != document_configuration_yaml['doc_modified']:
|
|
# Load versions list from document versions data in the rs.tdwg.org repo and find most recent version.
|
|
versions_data_url = github_base_url + 'docs/docs-versions.csv'
|
|
versions_list_df = pd.read_csv(versions_data_url, na_filter=False)
|
|
# Slice all rows for versions of this document.
|
|
matching_versions = versions_list_df[versions_list_df['current_iri']==document_configuration_yaml['current_iri']]
|
|
# Sort the matching versions by version IRI in descending order so that the most recent version is first.
|
|
matching_versions = matching_versions.sort_values(by=['version_iri'], ascending=[False])
|
|
# The previous version is the second row in the dataframe (row 1).
|
|
# The version IRI is in the second column (column 1).
|
|
most_recent_version_iri = matching_versions.iat[1, 1]
|
|
#print(most_recent_version_iri)
|
|
|
|
# Insert the previous version information into the header
|
|
previous_version_metadata_string = '''Previous version
|
|
: <''' + most_recent_version_iri + '''>
|
|
|
|
'''
|
|
# Insert the previous version information into the designated slot.
|
|
header = header.replace('{previous_version_slot}\n\n', previous_version_metadata_string)
|
|
else:
|
|
# If there was no previous version, remove the slot from the header.
|
|
header = header.replace('{previous_version_slot}\n\n', '')
|
|
|
|
outputObject = open(out_filename, 'wt', encoding='utf-8')
|
|
outputObject.write(header)
|
|
outputObject.close()
|
|
|
|
print('done')
|