#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Name:
    turl

Usage:
    turl <url> [--article-dir=<dir>]

Options:
    --article-dir=<dir>  where articles are stored [default: ./articles]
"""
import hashlib
import io
import os
import shlex
import subprocess
import sys

import docopt
from readability.readability import Document


def main(argv=sys.argv[1:]):
    args = docopt.docopt(__doc__, argv=argv)
    url = args["<url>"]
    articles = args["--article-dir"]

    # How awful to spawn a subprocess to request the webpage. However, I have
    # found that using urllib or requests is problematic over a range of
    # websites. Mainly the problems are due to a requirement for additional
    # headers. But since curl 'just works' I am going with that for now.
    html = subprocess.check_output(shlex.split("curl -s {}".format(url)))

    title = Document(html).title()

    # Extract the readable article body with readability, then pipe the HTML
    # through lynx to render it as plain text.
    process = subprocess.Popen(
        shlex.split("lynx -dump -stdin"),
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
    )
    summary = Document(html).summary().encode("utf-8")
    article, err = process.communicate(summary)

    document = u"url: {}\ntitle: {}\n--\n\n{}"
    document = document.format(url, title, article.decode("utf-8"))

    # Name the stored article after the MD5 digest of its contents.
    hash = hashlib.md5(document.encode("utf-8")).hexdigest()
    with io.open(os.path.join(articles, hash), "w", encoding="utf-8") as fp:
        fp.write(document)

    print(hash)


if __name__ == "__main__":
    main()
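
# Usage sketch (illustrative only; the URL below is a placeholder). Assumes
# curl and lynx are on PATH, the docopt and readability-lxml packages are
# installed, and the script is invoked as `turl`:
#
#     $ turl https://example.com/some-article --article-dir=./articles
#
# On success the script prints the MD5 hex digest of the saved document; the
# plain-text article is written to ./articles/<digest>.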