diff --git a/build/build.py b/build/build.py index 2f45db7..b9faa83 100644 --- a/build/build.py +++ b/build/build.py @@ -164,13 +164,30 @@ class DwcDigester(object): term_data["iri"] = term_iri term_data["label"] = vs_term['label'] term_data["class"] = cf_term['organized_in'] - term_data["definition"] = vs_term['definition'] - term_data["comments"] = cf_term['comments'] + term_data["definition"] = self.convert_link(vs_term['definition']) + term_data["comments"] = self.convert_link(cf_term['comments']) term_data["rdf_type"] = vs_term['rdf_type'] namespace_url, _ = self.split_iri(term_iri) term_data["namespace"] = self.resolve_namespace_abbrev(namespace_url) return term_data + @staticmethod + def convert_link(text_with_urls): + """ + + Notes + ------ + The underlying regex is not a general URL matcher and could have shortcomings... + """ + def _handle_matched(inputstring): + """quick hack version of url handling on the current prime versions data""" + url = inputstring.group() + if url.endswith("."): # not included in regex to notice the special 'end of . case' + url = url[:-1] + return "{}".format(url, url) + regx = "(http[s]?://[\w\d:#@%/;$()~_?\+-=\\\.&]*)(?