#!/usr/bin/python3
"""Download revisions of a Discourse post and save one diff column as Markdown.

For each revision number in ``REVISIONS`` the script fetches the revision's
JSON document, takes the ``side_by_side_markdown`` HTML table out of its
``body_changes`` entry, and writes the text of one table column to
``revision<N>.md`` in the current working directory.
"""

import sys
import json
from pathlib import Path
from urllib.request import urlopen
from html.parser import HTMLParser

# URL template of the revision JSON documents; ``{rev}`` is the revision number.
REVISION_URL = 'https://discourse.ubuntu.com/posts/120902/revisions/{rev}.json'
# Revision numbers to download (the upper bound is exclusive).
REVISIONS = range(96, 107)
# 1-based table column whose text is extracted.
# NOTE(review): presumably the "after" side of the diff -- confirm.
EXTRACT_COLUMN = 2
# Seconds to wait on the network before failing instead of hanging forever.
FETCH_TIMEOUT = 30


class TableParser(HTMLParser):
    """Collect the text of a single column of an HTML table.

    After HTML has been fed in, ``content`` holds the concatenated text found
    inside the ``column``-th ``<td>`` (counting from 1) of every row of the
    table most recently opened at top level.  Text inside tags nested in that
    cell (e.g. ``<ins>``) is included; text outside of cells is ignored.
    """

    def __init__(self, column):
        """Create a parser that extracts the 1-based ``column``."""
        super().__init__()
        self.extract_column = column
        self.current_col = 0
        self.state = 'top'
        self.content = ''
        # True only between a row's <td> and its </td> (or the end of the
        # row).  Without it, text following the wanted cell would still be
        # captured because current_col keeps its value until the next <tr>.
        self._in_cell = False

    def handle_starttag(self, tag, attrs):
        """Advance the top -> table -> tr state machine and count cells."""
        if tag == 'table' and self.state == 'top':
            # A new top-level table discards whatever was collected before.
            self.content = ''
            self.state = 'table'
        elif tag == 'tr' and self.state == 'table':
            self.state = 'tr'
            self.current_col = 0
            self._in_cell = False
        elif tag == 'td' and self.state == 'tr':
            self.current_col += 1
            self._in_cell = True

    def handle_endtag(self, tag):
        """Leave the current cell, row or table."""
        if tag == 'td' and self.state == 'tr':
            self._in_cell = False
        elif tag == 'tr' and self.state == 'tr':
            self.state = 'table'
            self._in_cell = False
        elif tag == 'table' and self.state == 'table':
            self.state = 'top'

    def handle_data(self, data):
        """Append ``data`` to ``content`` when inside the wanted column."""
        if self._in_cell and self.current_col == self.extract_column:
            self.content += data


def main():
    """Fetch every revision in ``REVISIONS`` and write its column to disk.

    Network, JSON and file-system errors are not caught; they propagate to
    the caller and abort the run.
    """
    for rev in REVISIONS:
        parser = TableParser(column=EXTRACT_COLUMN)
        url = REVISION_URL.format(rev=rev)
        with urlopen(url, timeout=FETCH_TIMEOUT) as fp:
            data = json.load(fp)
        changes = data['body_changes']['side_by_side_markdown']
        parser.feed(changes)
        # Flush any text the parser is still buffering.
        parser.close()
        # Explicit encoding so the result does not depend on the locale.
        Path(f'revision{rev}.md').write_text(parser.content, encoding='utf-8')


if __name__ == '__main__':
    sys.exit(main())