Update and add more tool scripts

2026-02-21 22:52:15 +13:00 · 2022-09-18 23:25:22 -04:00 · 2022-09-18 23:25:22 -04:00 · 208ea07237
commit 208ea07237
parent 3520a4c99a
12 changed files with 3207 additions and 95 deletions
--- a/tools/toc.py
+++ b/tools/toc.py
@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Usage: python toc.py file.md
+
+Replace a "## TOC" heading in a Markdown file with a table of contents,
+generated from the other headings in the file. Supports multiple files.
+Headings must start with "##" signs to be detected.
+"""
+
+import sys
+import re
+from collections import namedtuple
+from urllib.parse import quote
+
+toc_name = 'Contents'
+valid_toc_headings = {'## TOC', '##TOC'}
+
+TocItem = namedtuple('TocItem', ['name', 'anchor', 'level'])
+punctuation_rx = re.compile(r'[^\w\- ]+')
+numbered_heading_rx = re.compile(r'^[0-9]+\. ')
+specialchar_rx = re.compile(r'[⅔]+')
+
+def name_to_anchor(name):
+	# GitHub's algorithm for generating anchors from headings
+	# https://github.com/jch/html-pipeline/blob/master/lib/html/pipeline/toc_filter.rb
+	anchor = name.strip().lower()               # lowercase
+	anchor = re.sub(punctuation_rx, '', anchor) # remove punctuation
+	anchor = anchor.replace(' ', '-')           # replace spaces with dash
+	anchor = re.sub(specialchar_rx, '', anchor) # remove misc special chars
+	anchor = quote(anchor)                      # url encode
+	return anchor
+
+def get_toc_index(lines):
+	toc_index = None
+	for i, line in enumerate(lines):
+		if line.rstrip() in valid_toc_headings:
+			toc_index = i
+			break
+	return toc_index
+
+def get_toc_items(lines, toc_index):
+	for i, line in enumerate(lines):
+		if i <= toc_index:
+			continue
+		if line.startswith('##'):
+			name = line.lstrip('#')
+			level = len(line) - len(name) - len('##')
+			name = name.strip()
+			anchor = name_to_anchor(name)
+			yield TocItem(name, anchor, level)
+
+def toc_string(toc_items):
+	lines = [f'## {toc_name}', '']
+	for name, anchor, level in toc_items:
+		padding = '  ' * level
+		if re.match(numbered_heading_rx, name):
+			bullet, name = name.split('.', 1)
+			bullet += '.'
+			name = name.lstrip()
+		else:
+			bullet = '-'
+		lines.append(f'{padding}{bullet} [{name}](#{anchor})')
+	return '\n'.join(lines) + '\n'
+
+def add_toc(filename):
+	with open(filename, 'r', encoding='utf-8') as file:
+		lines = file.readlines()
+	toc_index = get_toc_index(lines)
+	if toc_index is None:
+		return None # no TOC heading
+	toc_items = list(get_toc_items(lines, toc_index))
+	if not toc_items:
+		return False # no content headings
+	with open(filename, 'w', encoding='utf-8') as file:
+		for i, line in enumerate(lines):
+			if i == toc_index:
+				file.write(toc_string(toc_items))
+			else:
+				file.write(line)
+	return True # OK
+
+def main():
+	if len(sys.argv) < 2:
+		print(f'Usage: {sys.argv[0]} file.md', file=sys.stderr)
+		sys.exit(1)
+	for filename in sys.argv[1:]:
+		print(filename)
+		result = add_toc(filename)
+		if result is None:
+			print('Warning: No "## TOC" heading found', file=sys.stderr)
+		elif result is False:
+			print('Warning: No content headings found', file=sys.stderr)
+		else:
+			print('OK')
+
+if __name__ == '__main__':
+	main()