Revisions
typehorror revised this gist
Aug 4, 2014. 1 changed file with 5 additions and 4 deletions.
**crawler.py**

```diff
@@ -74,10 +74,11 @@
     def get_urls(self, domain):
         """
         return all the URLS within a domain
         """
         self.cursor.execute("SELECT url FROM sites WHERE domain=?", (domain,))
-        row = self.cursor.fetchone()
-        while row:
-            yield row[0]
-            row = self.cursor.fetchone()
+        # could use fetchone and yield but I want to release
+        # my cursor after the call. I could have create a new cursor tho.
+        # ...Oh well
+        return [row[0] for row in self.cursor.fetchall()]


 class Crawler(object):
     def __init__(self, cache=None, depth=2):
```
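The comment in this revision is about cursor lifetime: with the earlier `yield`-based version, the shared cursor stays bound to the `SELECT` until the generator is exhausted, so any other query issued on it in the meantime abandons the remaining rows. A minimal standalone sketch of the trade-off (not part of the gist; the in-memory database mirrors the `sites` schema created by `CrawlerCache`):

```python
# -*- coding: utf-8 -*-
# sketch: lazy vs. eager fetching on a shared sqlite3 cursor
import sqlite3

conn = sqlite3.connect(':memory:')
cursor = conn.cursor()
cursor.execute("CREATE TABLE sites (domain text, url text, content text)")
cursor.execute("INSERT INTO sites VALUES ('example.com', '/', '<html/>')")
conn.commit()

def get_urls_lazy(domain):
    # the cursor stays bound to this query until the generator is
    # fully consumed; another execute() would abandon the rows
    cursor.execute("SELECT url FROM sites WHERE domain=?", (domain,))
    row = cursor.fetchone()
    while row:
        yield row[0]
        row = cursor.fetchone()

def get_urls_eager(domain):
    # fetchall() drains the result set before returning, so the
    # cursor is free for the next query immediately
    cursor.execute("SELECT url FROM sites WHERE domain=?", (domain,))
    return [row[0] for row in cursor.fetchall()]

print get_urls_eager('example.com')  # [u'/']
```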
typehorror revised this gist
Aug 3, 2014. 1 changed file with 21 additions and 10 deletions.
**crawler.py**

```diff
@@ -1,13 +1,13 @@
 # -*- coding: utf-8 -*-
 # filename: crawler.py

 import sqlite3
 import urllib2
 from HTMLParser import HTMLParser
 from urlparse import urlparse


 class HREFParser(HTMLParser):
     """
     Parser that extracts hrefs
     """
@@ -19,7 +19,7 @@
             self.hrefs.add(dict_attrs['href'])


 def get_local_links(html, domain):
     """
     Read through HTML content and returns a tuple of links
     internal to the given domain
@@ -39,7 +39,10 @@
     return hrefs


 class CrawlerCache(object):
+    """
+    Crawler data caching per relative URL and domain.
+    """
     def __init__(self, db_file):
         self.conn = sqlite3.connect(db_file)
         c = self.conn.cursor()
@@ -60,15 +63,23 @@
         """
         return the content for a given domain and relative url
         """
-        self.cursor.execute("SELECT * FROM sites WHERE domain=? and url=?",
-            (domain, url))
+        self.cursor.execute("SELECT content FROM sites WHERE domain=? and url=?",
+            (domain, url))
         row = self.cursor.fetchone()
         if row:
-            return row[2]
+            return row[0]
+
+    def get_urls(self, domain):
+        """
+        return all the URLS within a domain
+        """
+        self.cursor.execute("SELECT url FROM sites WHERE domain=?", (domain,))
+        row = self.cursor.fetchone()
+        while row:
+            yield row[0]
+            row = self.cursor.fetchone()


 class Crawler(object):
     def __init__(self, cache=None, depth=2):
         """
         depth: how many time it will bounce from page one (optional)
```
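This revision introduces `get_urls` as a generator. A hypothetical way to read cached URLs back out, assuming `crawler.db` was already populated by a crawl as in the README:

```python
# -*- coding: utf-8 -*-
# sketch: streaming cached URLs out of crawler.db (assumes a
# previous crawl of techcrunch.com populated the cache)
from crawler import CrawlerCache

cache = CrawlerCache('crawler.db')
# at this revision get_urls() yields rows one at a time
for url in cache.get_urls('techcrunch.com'):
    print url
```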
typehorror revised this gist
Aug 3, 2014. 1 changed file with 0 additions and 1 deletion.
**crawler.py**

````diff
@@ -1,4 +1,3 @@
-```language-python
 # -*- coding: utf-8 -*-
 # filename: crawler.py
````
typehorror revised this gist
Aug 3, 2014. 1 changed file with 2 additions and 3 deletions.
**crawler.py**

```diff
@@ -29,13 +29,12 @@
     parser = HREFParser()
     parser.feed(html)
     for href in parser.hrefs:
-        # complete relative urls
         u_parse = urlparse(href)
         if href.startswith('/'):
-            // purposefully using path, no query, no hash
+            # purposefully using path, no query, no hash
             hrefs.add(u_parse.path)
         else:
-            // only keep the local urls
+            # only keep the local urls
             if u_parse.netloc == domain:
                 hrefs.add(u_parse.path)
     return hrefs
```
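The branch logic here leans on how `urlparse` splits hrefs: a relative href has an empty `netloc`, while an absolute href carries the host, which is kept only when it matches the crawled domain. A quick standalone illustration (the example URLs are made up):

```python
# -*- coding: utf-8 -*-
# sketch: how get_local_links decides what to keep
from urlparse import urlparse

# relative href: netloc is empty, path is the href itself -> kept
print urlparse('/2014/08/some-post/').path                # /2014/08/some-post/

# absolute href on the crawled domain -> kept, path only
print urlparse('http://techcrunch.com/tag/apps/').netloc  # techcrunch.com

# absolute href on a foreign domain -> dropped
print urlparse('http://example.com/page').netloc          # example.com
```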
typehorror revised this gist
Aug 3, 2014. 1 changed file with 1 addition and 1 deletion.
**README.md**

```diff
@@ -1,4 +1,4 @@
-# Simple Python Website Crawler
+# Simple Website Crawler

 The following gist is an extract of the article [Building a simple crawler](http://www.debrice.com/building-a-simple-crawler/). It allows crawling from a URL and for a given number of bounces.
```
typehorror created this gist
Aug 3, 2014.
**README.md**

```markdown
# Simple Python Website Crawler

The following gist is an extract of the article
[Building a simple crawler](http://www.debrice.com/building-a-simple-crawler/).
It allows crawling from a URL and for a given number of bounces.

## Basic Usage

    from crawler import Crawler

    crawler = Crawler()
    crawler.crawl('http://techcrunch.com/')
    # displays the urls
    print crawler.content['techcrunch.com'].keys()

## Advanced Usage

The following uses a cache (an SQLite database, `crawler.db`) and crawls
to a depth of 3 from the home page. The `no_cache` parameter prevents '/'
from being cached, forcing a fresh pull of the homepage each time the
crawler is launched.

    import re
    from crawler import Crawler, CrawlerCache

    crawler = Crawler(CrawlerCache('crawler.db'), depth=3)
    crawler.crawl('http://techcrunch.com/', no_cache=re.compile('^/$').match)
    # displays the urls
    print crawler.content['techcrunch.com'].keys()
```

**crawler.py**

````python
```language-python
# -*- coding: utf-8 -*-
# filename: crawler.py

import sqlite3
import urllib2
from HTMLParser import HTMLParser
from urlparse import urlparse


class HREFParser(HTMLParser):
    """
    Parser that extracts hrefs
    """
    def __init__(self):
        HTMLParser.__init__(self)
        # per-instance set (a class-level set would keep
        # accumulating hrefs across every parser instance)
        self.hrefs = set()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            dict_attrs = dict(attrs)
            if dict_attrs.get('href'):
                self.hrefs.add(dict_attrs['href'])


def get_local_links(html, domain):
    """
    Read through HTML content and returns a tuple of links
    internal to the given domain
    """
    hrefs = set()
    parser = HREFParser()
    parser.feed(html)
    for href in parser.hrefs:
        # complete relative urls
        u_parse = urlparse(href)
        if href.startswith('/'):
            // purposefully using path, no query, no hash
            hrefs.add(u_parse.path)
        else:
            // only keep the local urls
            if u_parse.netloc == domain:
                hrefs.add(u_parse.path)
    return hrefs


class CrawlerCache(object):
    def __init__(self, db_file):
        self.conn = sqlite3.connect(db_file)
        c = self.conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS sites
            (domain text, url text, content text)''')
        self.conn.commit()
        self.cursor = self.conn.cursor()

    def set(self, domain, url, data):
        """
        store the content for a given domain and relative url
        """
        self.cursor.execute("INSERT INTO sites VALUES (?,?,?)",
            (domain, url, data))
        self.conn.commit()

    def get(self, domain, url):
        """
        return the content for a given domain and relative url
        """
        self.cursor.execute("SELECT * FROM sites WHERE domain=? and url=?",
            (domain, url))
        row = self.cursor.fetchone()
        if row:
            return row[2]


class Crawler(object):
    def __init__(self, cache=None, depth=2):
        """
        depth: how many time it will bounce from page one (optional)
        cache: a basic cache controller (optional)
        """
        self.depth = depth
        self.content = {}
        self.cache = cache

    def crawl(self, url, no_cache=None):
        """
        url: where we start crawling, should be a complete URL like
        'http://www.intel.com/news/'
        no_cache: function returning True if the url should be refreshed
        """
        u_parse = urlparse(url)
        self.domain = u_parse.netloc
        self.content[self.domain] = {}
        self.scheme = u_parse.scheme
        self.no_cache = no_cache
        self._crawl([u_parse.path], self.depth)

    def set(self, url, html):
        self.content[self.domain][url] = html
        if self.is_cacheable(url):
            self.cache.set(self.domain, url, html)

    def get(self, url):
        page = None
        if self.is_cacheable(url):
            page = self.cache.get(self.domain, url)
        if page is None:
            page = self.curl(url)
        else:
            print "cached url... [%s] %s" % (self.domain, url)
        return page

    def is_cacheable(self, url):
        return self.cache and self.no_cache \
            and not self.no_cache(url)

    def _crawl(self, urls, max_depth):
        n_urls = set()
        if max_depth:
            for url in urls:
                # do not crawl twice the same page
                if url not in self.content[self.domain]:
                    html = self.get(url)
                    self.set(url, html)
                    n_urls = n_urls.union(get_local_links(html, self.domain))
            self._crawl(n_urls, max_depth-1)

    def curl(self, url):
        """
        return content at url.
        return empty string if response raise an HTTPError (not found, 500...)
        """
        try:
            print "retrieving url... [%s] %s" % (self.domain, url)
            req = urllib2.Request('%s://%s%s' % (self.scheme, self.domain, url))
            response = urllib2.urlopen(req)
            return response.read().decode('ascii', 'ignore')
        except urllib2.HTTPError, e:
            print "error [%s] %s: %s" % (self.domain, url, e)
            return ''
````

**run.py**

```python
#!/usr/bin/python
# filename: run.py
import re
from crawler import Crawler, CrawlerCache

if __name__ == "__main__":
    # Using SQLite as a cache to avoid pulling twice
    crawler = Crawler(CrawlerCache('crawler.db'))
    root_re = re.compile('^/$').match
    crawler.crawl('http://techcrunch.com/', no_cache=root_re)
    crawler.crawl('http://www.engadget.com/', no_cache=root_re)
    crawler.crawl('http://gizmodo.com/', no_cache=root_re)
    crawler.crawl('http://www.zdnet.com/', no_cache=root_re)
    crawler.crawl('http://www.wired.com/', no_cache=root_re)
```