
@strogonoff
Created March 12, 2014 07:02
hnsavedsearch.py

    #coding: utf-8

    u"""
    Searches your saved HN items. Many false positives (overkill with synonyms).
    ..
    $ pip install mechanize nltk
    $ python hnsavedsearch.py username "space separated query"
    """

    if __name__ != '__main__':
        raise ImportError("hnsavedsearch isn't supposed to be imported")

    import argparse

    parser = argparse.ArgumentParser(
        description="Search your HN saved stories by title text.")

    parser.add_argument('username', type=str)
    parser.add_argument('query', type=str)

    args = parser.parse_args()


    # Mechanize setup

    import mechanize
    import cookielib

    br = mechanize.Browser()

    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)

    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)

    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    #br.set_debug_http(True)
    #br.set_debug_redirects(True)
    #br.set_debug_responses(True)

    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
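
    # The spoofed Firefox user agent and disabled robots.txt handling are
    # pragmatic choices: HN serves the saved-stories pages to ordinary
    # browsers, and a default mechanize Browser may refuse to fetch pages
    # disallowed by robots.txt.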


    # NLTK check

    try:
        from nltk.corpus import wordnet
        wordnet.synsets('cake')  # probe query; raises LookupError if the corpus is missing
    except LookupError:
        print "The wordnet corpus does not appear to be installed; opening the NLTK downloader. Download it to your home directory!"
        import nltk
        result = nltk.download()  # opens the interactive NLTK downloader
        if result:
            print "Installation hopefully successful"
        from nltk.corpus import wordnet


    # Log in

    import getpass

    br.open('https://news.ycombinator.com/newslogin?whence=news')
    br.select_form(nr=0)
    br.form['u'] = args.username
    br.form['p'] = getpass.getpass("Pass for %s: " % args.username)
    br.submit()
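
    # NB: this assumes the login form is the first form on the page and has
    # 'u' (username) and 'p' (password) fields, as HN's /newslogin page did
    # at the time of writing; br.submit() does not check that login succeeded.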


    # Prepare search

    def lemmas(words, synonyms=False):
        from nltk.stem import WordNetLemmatizer

        if synonyms:
            # Expand each word into every lemma of every WordNet synset
            # it belongs to -- the source of the advertised false positives.
            expanded = set(lemma
                           for word in words
                           for synset in wordnet.synsets(word)
                           for lemma in synset.lemma_names)  # attribute in NLTK 2.x

        else:
            expanded = set(WordNetLemmatizer().lemmatize(word)
                           for word in words)

        return expanded.union(set(words))
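
    # Roughly, assuming a stock WordNet corpus (exact synonym sets vary by
    # WordNet version):
    #
    #     >>> sorted(lemmas(['rockets']))
    #     ['rocket', 'rockets']
    #     >>> 'projectile' in lemmas(['rocket'], synonyms=True)
    #     True  # 'rocket' shares a WordNet synset with 'projectile'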

    query = lemmas(args.query.split())

    print "Original query: %s" % args.query
    print " expanded: %s" % ', '.join(w for w in query)


    # Search

    def iterate_links(url):
        global _page
        global _links_processed
        global _matches_found

        _match = None

        br.open(url)

        for link in br.links():

            # Internal links. On the saved-stories page each story's title
            # link is followed by its "item?id=..." comments link, so a title
            # matched below is remembered in _match and reported here, once
            # the discussion URL is known.
            if 'news.ycombinator.com' in link.absolute_url:

                if link.url.startswith('item?id=') and _match is not None:
                    print "{:<30} \"{}\" on page {}".format(
                        link.absolute_url, _match, _page)
                    _match = None
                    continue

                elif link.text == "More":
                    # Follow pagination to the next page of saved stories.
                    _page += 1
                    iterate_links(link.absolute_url)
                    break

            # External link: match the story title against the query.
            if query.intersection(lemmas(link.text.split(), True)):
                _matches_found += 1
                _match = link.text

            _links_processed += 1

    try:
        _page = 1
        _matches_found = 0
        _links_processed = 0

        iterate_links('https://news.ycombinator.com/saved?id=%s' % args.username)

    except KeyboardInterrupt:
        print "\n"
        print "Interrupted on page {}".format(_page)
        print "Links processed: {}".format(_links_processed)
        print "Matches found: {}".format(_matches_found)