#!/usr/bin/env python3
# Convert HTML markup from a file or stdin to plain text.
#
# Usage:
#   html2text.py [FILE]
#
# Reads FILE (decoded as UTF-8) when given, otherwise reads standard input.
import sys
from lxml import html
from lxml.html import tostring
from lxml.html.clean import Cleaner
def sanitize(dirty_html):
    """Return *dirty_html* after running it through an lxml ``Cleaner``.

    The cleaner is built with every removal switch listed below turned on,
    an explicit attribute whitelist, and a short list of tags handed to
    ``remove_tags``.

    Args:
        dirty_html: The HTML markup to clean.

    Returns:
        Whatever ``Cleaner.clean_html`` returns for the given input.
    """
    # Attributes that survive cleaning (safe_attrs_only is enabled below).
    allowed_attributes = frozenset(
        ['src', 'color', 'href', 'title', 'class', 'name', 'id']
    )
    # Passed as remove_tags; per the lxml Cleaner documentation these tags
    # are dropped while their content is pulled up into the parent.
    unwrapped_tags = ('span', 'font', 'div')

    # All boolean Cleaner switches used by this script; each one is set to True.
    enabled_switches = (
        'page_structure',
        'meta',
        'embedded',
        'links',
        'style',
        'processing_instructions',
        'inline_style',
        'scripts',
        'javascript',
        'comments',
        'frames',
        'forms',
        'annoying_tags',
        'remove_unknown_tags',
        'safe_attrs_only',
    )
    options = dict.fromkeys(enabled_switches, True)
    options['safe_attrs'] = allowed_attributes
    options['remove_tags'] = unwrapped_tags

    html_cleaner = Cleaner(**options)
    return html_cleaner.clean_html(dirty_html)
def main():
    """Read HTML from a file or stdin, clean it, and print its plain text.

    When a command-line argument is present it is treated as the path of a
    UTF-8 encoded input file; otherwise the markup is read from standard
    input. The markup is passed through ``sanitize``, parsed with lxml, and
    its text content is printed to standard output.
    """
    if len(sys.argv) > 1:
        # Use a context manager so the input file is closed after reading.
        with open(sys.argv[1], encoding='utf-8') as fin:
            source = fin.read()
    else:
        source = sys.stdin.read()

    source = sanitize(source)

    # NOTE(review): the search string on this line was garbled in the source
    # (the literal contained a raw line break, which is a syntax error).
    # '<br>' is the presumed original: text serialization emits nothing for
    # a <br> element, so line breaks must be turned into newlines first.
    # Confirm against the original script.
    source = source.replace('<br>', '\n')

    tree = html.fromstring(source)
    # method='text' serializes only the text content of the tree.
    plain = tostring(tree, method='text', encoding='utf-8')
    print(plain.decode('utf-8'))


if __name__ == '__main__':
    main()