#!/usr/bin/env python3
# Convert HTML markup from a file or stdin to plain text.
#
# Usage:
#   html2text.py [FILE]
#
# When FILE is omitted the markup is read from stdin.

import sys

from lxml import html
from lxml.html import tostring
from lxml.html.clean import Cleaner


def sanitize(dirty_html):
    """Return *dirty_html* after running it through an lxml ``Cleaner``.

    Every removal switch of the cleaner is turned on, attributes are
    restricted to a short allow-list, and ``span``/``font``/``div`` are
    listed in ``remove_tags``.

    Args:
        dirty_html: HTML markup as a string.

    Returns:
        The cleaned markup as returned by ``Cleaner.clean_html``.
    """
    cleaner = Cleaner(
        page_structure=True,
        meta=True,
        embedded=True,
        links=True,
        style=True,
        processing_instructions=True,
        inline_style=True,
        scripts=True,
        javascript=True,
        comments=True,
        frames=True,
        forms=True,
        annoying_tags=True,
        remove_unknown_tags=True,
        safe_attrs_only=True,
        safe_attrs=frozenset(
            ['src', 'color', 'href', 'title', 'class', 'name', 'id']
        ),
        remove_tags=('span', 'font', 'div'),
    )
    return cleaner.clean_html(dirty_html)


def main():
    """Read HTML from the file named in ``sys.argv[1]`` (or stdin) and print it as text."""
    if len(sys.argv) > 1:
        # Context manager so the input file is closed even if read() fails.
        with open(sys.argv[1], encoding='utf-8') as fin:
            source = fin.read()
    else:
        source = sys.stdin.read()

    # lxml raises ParserError ("Document is empty") on blank input, so
    # there is nothing to convert; emit nothing rather than a traceback.
    if not source.strip():
        return

    source = sanitize(source)
    # Turn line-break elements into real newlines before text extraction.
    # NOTE(review): the search literal was lost when this file was
    # extracted (it appeared as a raw newline); '<br>' is the presumed
    # original -- confirm against the upstream script.
    source = source.replace('<br>', '\n')
    tree = html.fromstring(source)
    # encoding='unicode' yields str directly, avoiding an encode/decode
    # round-trip through UTF-8 bytes.
    print(tostring(tree, method='text', encoding='unicode'))


if __name__ == '__main__':
    main()