#!/usr/bin/env python3
# OpenMPT help file scraper
# by coda (https://coda.s3m.us/) and Saga Musix (https://sagamusix.de/)
# This script downloads the OpenMPT manual TOC and then downloads all pages
# from that TOC. The pages are parsed and all required image files are fetched.
# The script also generates the appropriate files that can be fed into the
# HTML Help Workshop to generate a CHM file.

from urllib.request import urlopen, urlretrieve
import os
import re
import shutil
import subprocess

base_url = 'https://wiki.openmpt.org'
# The dots are escaped so that they only match literal dots of the host name.
base_url_regex = r'https?://wiki\.openmpt\.org'

# Handle of the HTML Help project file (.hhp) while it is being written.
# It is set by main() and used by replace_images() to list image files.
project = None
# Names of the image files that were already downloaded and listed in the
# project file, so that an image used on several pages is fetched only once.
downloaded_images = set()


def fetch(url):
    """Download `url` and return the response body decoded as UTF-8."""
    with urlopen(url) as response:
        return response.read().decode('UTF-8')


def destname(p):
    """Return the local HTML file name for the wiki page name `p`.

    Only the part after the first ':_' is used. Slashes and dots become
    underscores, runs of underscores are collapsed into a single one and
    '.html' is appended. If the name contains a '#', the '.html' suffix is
    inserted in front of the anchor.

    Raises IndexError if `p` does not contain ':_'.
    """
    p = p.split(':_')[1]
    p = p.replace('/', '_').replace('.', '_')
    p = re.sub(r'_+', '_', p)
    if '#' in p:
        parts = p.split('#')
        return parts[0] + '.html#' + parts[1]
    return p + '.html'


def title(p):
    """Return the human-readable title for the wiki page name `p`.

    This is the part after the first ':_' with underscores turned into spaces.
    Raises IndexError if `p` does not contain ':_'.
    """
    return p.split(':_')[1].replace('_', ' ')


def localurl(p):
    """Return the URL used inside the CHM to link to wiki page `p`."""
    return destname(p)


def replace_images(m):
    """Regex callback: download the matched image and return its local name.

    `m` must provide the two image directory components in groups 1 and 2 and
    the image file name in group 3. The image is stored in the 'html' folder
    and added to the file list of the project file. The return value is the
    quoted file name which replaces the quoted server path in the page.
    """
    filepath = m.group(1) + '/' + m.group(2) + '/'
    filename = m.group(3)
    if filename not in downloaded_images:
        project.write(filename + "\n")
        urlretrieve(base_url + '/images/' + filepath + filename, 'html/' + filename)
        downloaded_images.add(filename)
    return '"' + filename + '"'


def fix_internal_links(m):
    """Regex callback: turn a link to a manual page into a link to the local file."""
    return '<a href="' + localurl(m.group(1)) + '"'


def convert_page(content):
    """Return the rendered wiki HTML `content` rewritten for use in the CHM.

    Images are downloaded and referenced by their local names (this requires
    the project file to be open, see replace_images), links to manual pages
    are made local and markup that is not wanted in the manual is removed.
    """
    # Download and replace image URLs
    content = re.sub(r' srcset=".+?"', '', content)
    content = re.sub(r'"/images/thumb/(\w+)/(\w+)/([^\/]+?)/([^\/]+?)"', replace_images, content)
    content = re.sub(r'"/images/(\w+)/(\w+)/([^\/]+?)"', replace_images, content)
    # Remove comments
    content = re.sub(r'<!--(.+?)-->', '', content, flags=re.DOTALL)
    # Fix local URLs
    content = re.sub(r'<a href="' + base_url_regex + '/File:', '<a href="', content)
    content = re.sub(r'<a href="' + base_url_regex + '/(Manual:.+?)"', fix_internal_links, content)
    content = re.sub(r'<a href="/(Manual:.+?)"', fix_internal_links, content)
    # Remove templates that shouldn't turn up in the manual
    content = re.sub(r'<div class="todo".+?</div>', '', content, flags=re.DOTALL)
    content = re.sub(r'<p class="newversion".+?</p>', '', content, flags=re.DOTALL)
    # Don't need this attribute in our CHM
    content = re.sub(r' rel="nofollow"', '', content)
    return content


def build_page(page_title, body):
    """Return a complete HTML document with heading `page_title` around `body`."""
    return """<!DOCTYPE html>
<html lang="en">
<head>
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<link href="style.css" rel="stylesheet">
<link href="help.css" rel="stylesheet">
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>OpenMPT Manual - """ + page_title + """</title>
</head>
<body>
<h1>""" + page_title + '</h1><div id="content" class="mw-body">' + body + '</div></body></html>'


def toc_parse(m):
    """Regex callback: turn a TOC link into a sitemap object for the .hhc file.

    Group 1 of `m` is the wiki page name, group 2 is the link text.
    """
    return """<OBJECT type="text/sitemap">
    <param name="Name" value=\"""" + m.group(2) + """">
    <param name="Local" value=\"""" + localurl(m.group(1)) + """">
    </OBJECT>"""


def toc_parse_chapter(m):
    """Regex callback: turn a list item without a link into a chapter entry.

    Group 1 of `m` is the chapter name.
    """
    return """<li><OBJECT type="text/sitemap">
    <param name="Name" value=\"""" + m.group(1) + """">
    </OBJECT>"""


def build_toc(toc_page):
    """Return the body of the .hhc contents file for the rendered TOC page."""
    toc_text = re.sub(r'<!--(.+?)-->', '', toc_page, flags=re.DOTALL)
    toc_text = re.sub(r'<div(.+?)>', '', toc_text, flags=re.DOTALL)
    toc_text = re.sub(r'</div>', '', toc_text, flags=re.DOTALL)
    toc_text = re.sub(r'<a href="' + base_url_regex + '/(.+?)".*?>(.+?)</a>', toc_parse, toc_text)
    toc_text = re.sub(r'<li>([^<]+)$', toc_parse_chapter, toc_text, flags=re.MULTILINE)
    return toc_text


def main():
    """Download the manual, write the HTML Help project and compile the CHM.

    Raises Exception if the TOC page lists no pages or if the exit code of the
    help compiler is not the expected one.
    """
    global project

    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    shutil.rmtree('html', ignore_errors=True)
    shutil.copytree('source', 'html')
    downloaded_images.clear()

    style = fetch(base_url + '/load.php?debug=false&lang=en&modules=mediawiki.legacy.commonPrint%2Cshared%7Cmediawiki.page.gallery.styles%7Cmediawiki.skinning.interface%7Cskins.vector.styles%7Csite.styles&only=styles&skin=vector')
    # Remove a few unused CSS classes
    style = re.sub(r'\}(\w+)?[\.#]vector([\w >]+)\{.+?\}', '}', style)
    # The style sheet was decoded from UTF-8 and is linked from UTF-8 pages,
    # so it is written as UTF-8 instead of the locale's default encoding.
    with open('html/style.css', 'w', encoding='UTF-8') as style_file:
        style_file.write(style)

    toc_page = fetch(base_url + '/index.php?title=Manual:_CHM_TOC&action=render')
    pages = re.findall('href="' + base_url_regex + '/(.+?)"', toc_page)
    if not pages:
        raise Exception("No manual pages were found in the table of contents!")
    first_topic = localurl(pages[0])

    # The project file keeps the platform's default text encoding, as before.
    with open('html/OpenMPT Manual.hhp', 'w') as project:
        project.write("""[OPTIONS]
Compatibility=1.1 or later
Compiled file=OpenMPT Manual.chm
Contents file=OpenMPT Manual.hhc
Display compile progress=No
Full-text search=Yes
Language=0x409 English (United States)
Title=OpenMPT Manual
Default Window=OpenMPT
Default topic=""" + first_topic + """

[WINDOWS]
OpenMPT=,"OpenMPT Manual.hhc",,""" + first_topic + """,,,,,,0x42520,215,0x300e,[20,20,780,580],0xb0000,,,,,,0

[FILES]
style.css
help.css
bullet.png
external.png
""")

        for p in pages:
            content = fetch(base_url + '/index.php?title=' + p + '&action=render')
            # Image file names are appended to the project file while converting.
            content = build_page(title(p), convert_page(content))

            with open('html/' + destname(p), 'wb') as saved:
                saved.write(content.encode('UTF-8'))

            project.write(destname(p) + "\n")
            print(p)

    # Create TOC
    with open('html/OpenMPT Manual.hhc', 'w') as toc:
        toc.write("""
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<HTML>
<HEAD>
<meta name="GENERATOR" content="OpenMPT Help Generator">
<!-- Sitemap 1.0 -->
</HEAD><BODY>
<OBJECT type="text/site properties">
    <param name="ImageType" value="Folder">
</OBJECT>
""")
        toc.write(build_toc(toc_page))
        toc.write("""
</BODY></HTML>
""")

    # Every exit code other than 1 is treated as a failure of the help compiler.
    # NOTE(review): the project path is passed with literal double quotes, exactly
    # as before - verify with hhc.exe before changing this.
    if subprocess.call(['../../build/tools/htmlhelp/hhc.exe', '"html/OpenMPT Manual.hhp"']) != 1:
        raise Exception("Something went wrong during manual creation!")

    # NOTE(review): this removes a file below packageTemplate/html/ while the new
    # file is copied to packageTemplate/ - confirm that this is intended.
    try:
        os.remove('../../packageTemplate/html/OpenMPT Manual.chm')
    except OSError:
        pass
    shutil.copy2('html/OpenMPT Manual.chm', '../../packageTemplate/')


if __name__ == '__main__':
    main()