Created
March 12, 2014 07:02
-
-
Save strogonoff/9502160 to your computer and use it in GitHub Desktop.
Revisions
-
strogonoff created this gist
Mar 12, 2014. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,145 @@ #coding: utf-8 u""" Searches your saved HN items. Many false positives (overkill with synonyms). .. $ pip install mechanize nltk $ python hnsavedsearch.py username "space separated query" """ if __name__ != '__main__': raise ImportError("hnsavedsearch isn't supposed to be imported") import argparse parser = argparse.ArgumentParser( description="Search your HN saved stories by title text.") parser.add_argument('username', type=str) parser.add_argument('query', type=str) args = parser.parse_args() # Mechanize setup import mechanize import cookielib br = mechanize.Browser() cj = cookielib.LWPCookieJar() br.set_cookiejar(cj) br.set_handle_equiv(True) br.set_handle_gzip(True) br.set_handle_redirect(True) br.set_handle_referer(True) br.set_handle_robots(False) br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) #br.set_debug_http(True) #br.set_debug_redirects(True) #br.set_debug_responses(True) br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')] # NLTK check try: from nltk.corpus import wordnet wordnet.synsets('cake') except LookupError: print "wordnet corpus appears to be not installed, initiating download. Download to home directory!" 
import nltk result = nltk.download() if result == True: print "Installation hopefully successful" from ntlk.corpus import wordnet # Log in import getpass br.open('https://news.ycombinator.com/newslogin?whence=news') br.select_form(nr=0) br.form['u'] = args.username br.form['p'] = getpass.getpass("Pass for %s: " % args.username) br.submit() # Prepare search def lemmas(words, synonyms=False): from nltk import wordnet as wn if synonyms: lemmas = set(lemma for word in words for synset in wordnet.synsets(word) for lemma in synset.lemma_names) else: lemmas = set(wn.WordNetLemmatizer().lemmatize(word) for word in words) return lemmas.union(set(words)) query = lemmas(args.query.split()) print "Original query: %s" % args.query print " expanded: %s" % ', '.join(w for w in query) # Search def iterate_links(url): global _page global _links_processed global _matches_found _match = None br.open(url) for link in br.links(): # Internal links if 'news.ycombinator.com' in link.absolute_url: if link.url.startswith('item?id=') and _match is not None: print "{:<30} \"{}\" on page {}".format( link.absolute_url, _match, _page) _match = None continue elif link.text == "More": _page += 1 iterate_links(link.absolute_url) break # External link if query.intersection(lemmas(link.text.split(), True)): _matches_found += 1 _match = link.text _links_processed += 1 try: _page = 1 _matches_found = 0 _links_processed = 0 iterate_links('https://news.ycombinator.com/saved?id=%s' % args.username) except KeyboardInterrupt: print "\n" print "Interrupted on page {}".format(_page) print "Links processed: {}".format(_links_processed) print "Matches found: {}".format(_matches_found)