Created
March 12, 2014 07:02
-
-
Save strogonoff/9502160 to your computer and use it in GitHub Desktop.
Revisions
-
strogonoff created this gist
Mar 12, 2014. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,145 @@ #coding: utf-8 u""" Searches your saved HN items. Many false positives (overkill with synonyms). .. $ pip install mechanize nltk $ python hnsavedsearch.py username "space separated query" """ if __name__ != '__main__': raise ImportError("hnsavedsearch isn't supposed to be imported") import argparse parser = argparse.ArgumentParser( description="Search your HN saved stories by title text.") parser.add_argument('username', type=str) parser.add_argument('query', type=str) args = parser.parse_args() # Mechanize setup import mechanize import cookielib br = mechanize.Browser() cj = cookielib.LWPCookieJar() br.set_cookiejar(cj) br.set_handle_equiv(True) br.set_handle_gzip(True) br.set_handle_redirect(True) br.set_handle_referer(True) br.set_handle_robots(False) br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) #br.set_debug_http(True) #br.set_debug_redirects(True) #br.set_debug_responses(True) br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')] # NLTK check try: from nltk.corpus import wordnet wordnet.synsets('cake') except LookupError: print "wordnet corpus appears to be not installed, initiating download. Download to home directory!" 
import nltk result = nltk.download() if result == True: print "Installation hopefully successful" from ntlk.corpus import wordnet # Log in import getpass br.open('https://news.ycombinator.com/newslogin?whence=news') br.select_form(nr=0) br.form['u'] = args.username br.form['p'] = getpass.getpass("Pass for %s: " % args.username) br.submit() # Prepare search def lemmas(words, synonyms=False): from nltk import wordnet as wn if synonyms: lemmas = set(lemma for word in words for synset in wordnet.synsets(word) for lemma in synset.lemma_names) else: lemmas = set(wn.WordNetLemmatizer().lemmatize(word) for word in words) return lemmas.union(set(words)) query = lemmas(args.query.split()) print "Original query: %s" % args.query print " expanded: %s" % ', '.join(w for w in query) # Search def iterate_links(url): global _page global _links_processed global _matches_found _match = None br.open(url) for link in br.links(): # Internal links if 'news.ycombinator.com' in link.absolute_url: if link.url.startswith('item?id=') and _match is not None: print "{:<30} \"{}\" on page {}".format( link.absolute_url, _match, _page) _match = None continue elif link.text == "More": _page += 1 iterate_links(link.absolute_url) break # External link if query.intersection(lemmas(link.text.split(), True)): _matches_found += 1 _match = link.text _links_processed += 1 try: _page = 1 _matches_found = 0 _links_processed = 0 iterate_links('https://news.ycombinator.com/saved?id=%s' % args.username) except KeyboardInterrupt: print "\n" print "Interrupted on page {}".format(_page) print "Links processed: {}".format(_links_processed) print "Matches found: {}".format(_matches_found)