#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Name:
    turl

Usage:
    turl <url> [--article-dir=<dir>]

Options:
    --article-dir=<dir>  where articles are stored [default: ./articles]
"""
import hashlib
import io
import os
import shlex
import subprocess
import sys

import docopt
from readability.readability import Document


def main(argv=sys.argv[1:]):
    args = docopt.docopt(__doc__, argv=argv)
    url = args["<url>"]
    articles = args["--article-dir"]

    # How awful to spawn a subprocess to request the webpage. However, I have
    # found that using urllib or requests is problematic over a range of
    # websites. Mainly the problems are due to a requirement for additional
    # headers. But since curl 'just works' I am going with that for now.
    html = subprocess.check_output(shlex.split("curl -s {}".format(url)))

    title = Document(html).title()

    # Extract the readable article body with readability, then pipe the HTML
    # through lynx to render it as plain text.
    process = subprocess.Popen(
        shlex.split("lynx -dump -stdin"),
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
    )
    summary = Document(html).summary().encode("utf-8")
    article, err = process.communicate(summary)

    document = u"url: {}\ntitle: {}\n--\n\n{}"
    document = document.format(url, title, article.decode("utf-8"))

    # Name the stored article after the MD5 digest of its contents.
    hash = hashlib.md5(document.encode("utf-8")).hexdigest()
    with io.open(os.path.join(articles, hash), "w", encoding="utf-8") as fp:
        fp.write(document)

    print(hash)


if __name__ == "__main__":
    main()
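
# Usage sketch (illustrative only; the URL below is a placeholder). Assumes
# curl and lynx are on PATH, the docopt and readability-lxml packages are
# installed, and the script is invoked as `turl`:
#
#     $ turl https://example.com/some-article --article-dir=./articles
#
# On success the script prints the MD5 hex digest of the saved document; the
# plain-text article is written to ./articles/<digest>.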