@yongjin-shin
Forked from typehorror/crawler.md
Created October 2, 2017 10:47

Revisions

  1. @typehorror typehorror revised this gist Aug 4, 2014. 1 changed file with 5 additions and 4 deletions.
    9 changes: 5 additions & 4 deletions crawler.py

    @@ -74,10 +74,11 @@ def get_urls(self, domain):
             return all the URLS within a domain
             """
             self.cursor.execute("SELECT url FROM sites WHERE domain=?", (domain,))
    -        row = self.cursor.fetchone()
    -        while row:
    -            yield row[0]
    -            row = self.cursor.fetchone()
    +        # could use fetchone and yield but I want to release
    +        # my cursor after the call. I could have create a new cursor tho.
    +        # ...Oh well
    +        return [row[0] for row in self.cursor.fetchall()]


     class Crawler(object):
         def __init__(self, cache=None, depth=2):
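
    For context, here is a small, runnable illustration of the pattern this revision switches to: fetching every row at once with `fetchall()` and returning a list, rather than yielding rows from the shared cursor. The in-memory database and the two sample rows are purely illustrative; in the gist the `sites` table lives in `crawler.db` and is created by `CrawlerCache.__init__`.

        # Illustrative sketch only: mimics CrawlerCache.get_urls() after this revision,
        # using an in-memory SQLite database instead of crawler.db.
        import sqlite3

        conn = sqlite3.connect(':memory:')
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS sites
            (domain text, url text, content text)''')
        cursor.executemany("INSERT INTO sites VALUES (?,?,?)", [
            ('techcrunch.com', '/', '<html>home</html>'),
            ('techcrunch.com', '/about/', '<html>about</html>'),
        ])

        def get_urls(domain):
            # same query as the gist; the whole result set is materialized as a list,
            # so nothing keeps the cursor busy after the call returns
            cursor.execute("SELECT url FROM sites WHERE domain=?", (domain,))
            return [row[0] for row in cursor.fetchall()]

        print(get_urls('techcrunch.com'))  # ['/', '/about/']
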
  2. @typehorror typehorror revised this gist Aug 3, 2014. 1 changed file with 21 additions and 10 deletions.
    31 changes: 21 additions & 10 deletions crawler.py

    @@ -1,13 +1,13 @@
     # -*- coding: utf-8 -*-
     # filename: crawler.py

    -import sqlite3
    -import urllib2
    -from HTMLParser import HTMLParser
    +import sqlite3
    +import urllib2
    +from HTMLParser import HTMLParser
     from urlparse import urlparse


    -class HREFParser(HTMLParser):
    +class HREFParser(HTMLParser):
         """
         Parser that extracts hrefs
         """
    @@ -19,7 +19,7 @@ def handle_starttag(self, tag, attrs):
                     self.hrefs.add(dict_attrs['href'])


    -def get_local_links(html, domain):
    +def get_local_links(html, domain):
         """
         Read through HTML content and returns a tuple of links
         internal to the given domain
    @@ -39,7 +39,10 @@ def get_local_links(html, domain):
         return hrefs


    -class CrawlerCache(object):
    +class CrawlerCache(object):
    +    """
    +    Crawler data caching per relative URL and domain.
    +    """
         def __init__(self, db_file):
             self.conn = sqlite3.connect(db_file)
             c = self.conn.cursor()
    @@ -60,15 +63,23 @@ def get(self, domain, url):
             """
             return the content for a given domain and relative url
             """
    -        self.cursor.execute("SELECT * FROM sites WHERE domain=? and url=?",
    +        self.cursor.execute("SELECT content FROM sites WHERE domain=? and url=?",
                 (domain, url))
             row = self.cursor.fetchone()
             if row:
    -            return row[2]
    +            return row[0]
    +
    +    def get_urls(self, domain):
    +        """
    +        return all the URLS within a domain
    +        """
    +        self.cursor.execute("SELECT url FROM sites WHERE domain=?", (domain,))
    +        row = self.cursor.fetchone()
    +        while row:
    +            yield row[0]
    +            row = self.cursor.fetchone()


     class Crawler(object):
         def __init__(self, cache=None, depth=2):
             """
             depth: how many time it will bounce from page one (optional)
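
    A small, self-contained illustration of the `get()` change in this revision: selecting only the `content` column returns a one-element row, so the value is `row[0]`, whereas the earlier `SELECT *` returned the full `(domain, url, content)` row and the content sat at `row[2]`. The in-memory table below is just a stand-in for `crawler.db`.

        # Illustrative sketch only: shows the row shapes behind the SELECT change.
        import sqlite3

        conn = sqlite3.connect(':memory:')
        cursor = conn.cursor()
        cursor.execute("CREATE TABLE sites (domain text, url text, content text)")
        cursor.execute("INSERT INTO sites VALUES (?,?,?)",
            ('techcrunch.com', '/', '<html>home</html>'))

        cursor.execute("SELECT * FROM sites WHERE domain=? and url=?",
            ('techcrunch.com', '/'))
        print(cursor.fetchone())  # ('techcrunch.com', '/', '<html>home</html>') -> content is row[2]

        cursor.execute("SELECT content FROM sites WHERE domain=? and url=?",
            ('techcrunch.com', '/'))
        print(cursor.fetchone())  # ('<html>home</html>',) -> content is row[0]
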
  3. @typehorror typehorror revised this gist Aug 3, 2014. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion crawler.py

    @@ -1,4 +1,3 @@
    -```language-python
     # -*- coding: utf-8 -*-
     # filename: crawler.py

  4. @typehorror typehorror revised this gist Aug 3, 2014. 1 changed file with 2 additions and 3 deletions.
    5 changes: 2 additions & 3 deletions crawler.py

    @@ -29,13 +29,12 @@ def get_local_links(html, domain):
         parser = HREFParser()
         parser.feed(html)
         for href in parser.hrefs:
    -        # complete relative urls
             u_parse = urlparse(href)
             if href.startswith('/'):
    -            // purposefully using path, no query, no hash
    +            # purposefully using path, no query, no hash
                 hrefs.add(u_parse.path)
             else:
    -            // only keep the local urls
    +            # only keep the local urls
                 if u_parse.netloc == domain:
                     hrefs.add(u_parse.path)
         return hrefs
  5. @typehorror typehorror revised this gist Aug 3, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion crawler.md

    @@ -1,4 +1,4 @@
    -# Simple Python Website Crawler
    +# Simple Website Crawler

     The following gist is an extract of the article [Building a simple crawler](http://www.debrice.com/building-a-simple-crawler/). It allows crawling from a URL and for a given number of bounces.

  6. @typehorror typehorror created this gist Aug 3, 2014.
    22 changes: 22 additions & 0 deletions crawler.md

    # Simple Python Website Crawler

    The following gist is an extract of the article [Building a simple crawler](http://www.debrice.com/building-a-simple-crawler/). It allows crawling from a URL and for a given number of bounces.

    ## Basic Usage

        from crawler import Crawler
        crawler = Crawler()
        crawler.crawl('http://techcrunch.com/')
        # displays the urls
        print crawler.content['techcrunch.com'].keys()

    ## Advanced Usage

    The following uses a cache (an SQLite file, `crawler.db`) and crawls to a depth of 3 from the home page. The `no_cache` parameter prevents '/' from being cached, forcing a fresh pull of the homepage each time the crawler is launched.

        import re
        from crawler import Crawler, CrawlerCache
        crawler = Crawler(CrawlerCache('crawler.db'), depth=3)
        crawler.crawl('http://techcrunch.com/', no_cache=re.compile('^/$').match)
        # displays the urls
        print crawler.content['techcrunch.com'].keys()
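
    As an aside on the `no_cache` parameter used in the advanced example above: `re.compile('^/$').match` is simply a predicate that is truthy only for the bare path '/', so only the homepage bypasses the cache. A quick check (the second path is just an illustrative example):

        import re

        no_cache = re.compile('^/$').match  # same predicate as in the advanced usage example

        print(bool(no_cache('/')))        # True  -> '/' is always re-fetched
        print(bool(no_cache('/about/')))  # False -> other paths can come from the cache
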
    138 changes: 138 additions & 0 deletions crawler.py

    ```language-python
    # -*- coding: utf-8 -*-
    # filename: crawler.py

    import sqlite3
    import urllib2
    from HTMLParser import HTMLParser
    from urlparse import urlparse


    class HREFParser(HTMLParser):
        """
        Parser that extracts hrefs
        """
        hrefs = set()
        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                dict_attrs = dict(attrs)
                if dict_attrs.get('href'):
                    self.hrefs.add(dict_attrs['href'])


    def get_local_links(html, domain):
        """
        Read through HTML content and returns a tuple of links
        internal to the given domain
        """
        hrefs = set()
        parser = HREFParser()
        parser.feed(html)
        for href in parser.hrefs:
            # complete relative urls
            u_parse = urlparse(href)
            if href.startswith('/'):
                // purposefully using path, no query, no hash
                hrefs.add(u_parse.path)
            else:
                // only keep the local urls
                if u_parse.netloc == domain:
                    hrefs.add(u_parse.path)
        return hrefs


    class CrawlerCache(object):
        def __init__(self, db_file):
            self.conn = sqlite3.connect(db_file)
            c = self.conn.cursor()
            c.execute('''CREATE TABLE IF NOT EXISTS sites
                (domain text, url text, content text)''')
            self.conn.commit()
            self.cursor = self.conn.cursor()

        def set(self, domain, url, data):
            """
            store the content for a given domain and relative url
            """
            self.cursor.execute("INSERT INTO sites VALUES (?,?,?)",
                (domain, url, data))
            self.conn.commit()

        def get(self, domain, url):
            """
            return the content for a given domain and relative url
            """
            self.cursor.execute("SELECT * FROM sites WHERE domain=? and url=?",
                (domain, url))
            row = self.cursor.fetchone()
            if row:
                return row[2]


    class Crawler(object):

        def __init__(self, cache=None, depth=2):
            """
            depth: how many time it will bounce from page one (optional)
            cache: a basic cache controller (optional)
            """
            self.depth = depth
            self.content = {}
            self.cache = cache

        def crawl(self, url, no_cache=None):
            """
            url: where we start crawling, should be a complete URL like
            'http://www.intel.com/news/'
            no_cache: function returning True if the url should be refreshed
            """
            u_parse = urlparse(url)
            self.domain = u_parse.netloc
            self.content[self.domain] = {}
            self.scheme = u_parse.scheme
            self.no_cache = no_cache
            self._crawl([u_parse.path], self.depth)

        def set(self, url, html):
            self.content[self.domain][url] = html
            if self.is_cacheable(url):
                self.cache.set(self.domain, url, html)

        def get(self, url):
            page = None
            if self.is_cacheable(url):
                page = self.cache.get(self.domain, url)
            if page is None:
                page = self.curl(url)
            else:
                print "cached url... [%s] %s" % (self.domain, url)
            return page

        def is_cacheable(self, url):
            return self.cache and self.no_cache \
                and not self.no_cache(url)

        def _crawl(self, urls, max_depth):
            n_urls = set()
            if max_depth:
                for url in urls:
                    # do not crawl twice the same page
                    if url not in self.content:
                        html = self.get(url)
                        self.set(url, html)
                        n_urls = n_urls.union(get_local_links(html, self.domain))
                self._crawl(n_urls, max_depth-1)

        def curl(self, url):
            """
            return content at url.
            return empty string if response raise an HTTPError (not found, 500...)
            """
            try:
                print "retrieving url... [%s] %s" % (self.domain, url)
                req = urllib2.Request('%s://%s%s' % (self.scheme, self.domain, url))
                response = urllib2.urlopen(req)
                return response.read().decode('ascii', 'ignore')
            except urllib2.HTTPError, e:
                print "error [%s] %s: %s" % (self.domain, url, e)
                return ''
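
    Note that the listing above is Python 2 only (`urllib2`, `HTMLParser`, `urlparse`, `print` statements). If you want to experiment with it under Python 3, the same standard-library pieces live under different names; the sketch below covers the imports only and is an assumption about how one might start a port, not part of the gist (the `print` statements and the `except urllib2.HTTPError, e` syntax would still need updating).

        # Import-compatibility sketch only; the rest of crawler.py still uses
        # Python 2 syntax (print statements, "except urllib2.HTTPError, e").
        import sqlite3  # unchanged between Python 2 and 3

        try:  # Python 2, as used in the gist
            import urllib2 as urlrequest
            from urllib2 import HTTPError
            from HTMLParser import HTMLParser
            from urlparse import urlparse
        except ImportError:  # Python 3 locations of the same tools
            import urllib.request as urlrequest
            from urllib.error import HTTPError
            from html.parser import HTMLParser
            from urllib.parse import urlparse

        # urlrequest.Request and urlrequest.urlopen behave the same in both cases
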
    14 changes: 14 additions & 0 deletions run.py

    #!/usr/bin/python
    # filename: run.py
    import re
    from crawler import Crawler, CrawlerCache

    if __name__ == "__main__":
        # Using SQLite as a cache to avoid pulling twice
        crawler = Crawler(CrawlerCache('crawler.db'))
        root_re = re.compile('^/$').match
        crawler.crawl('http://techcrunch.com/', no_cache=root_re)
        crawler.crawl('http://www.engadget.com/', no_cache=root_re)
        crawler.crawl('http://gizmodo.com/', no_cache=root_re)
        crawler.crawl('http://www.zdnet.com/', no_cache=root_re)
        crawler.crawl('http://www.wired.com/', no_cache=root_re)
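
    After a run, the SQLite cache can also be inspected on its own. A minimal sketch, assuming `crawler.py` from this gist is importable and `crawler.db` has already been populated by run.py:

        # Assumes run.py (or an equivalent crawl) has already filled crawler.db.
        from crawler import CrawlerCache

        cache = CrawlerCache('crawler.db')
        # get_urls() returns every relative URL cached for the domain
        # (a plain list since the Aug 4, 2014 revision)
        for url in cache.get_urls('techcrunch.com'):
            print(url)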