
@rvause
Created August 23, 2011 17:00

Revisions

  1. rvause revised this gist Aug 23, 2011. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion fourdown.py
    @@ -90,7 +90,6 @@ def get_images(self):
                         print '%s Getting %s...' % (progress, filename)
                         self._get_url(image, save_file)
                     except Exception as error:
    -                    print error
                         print '%s Failed getting %s, we will get it next time' % (progress, image)
                     time.sleep(self.download_delay)

  2. rvause revised this gist Aug 23, 2011. 1 changed file with 24 additions and 7 deletions.
    31 changes: 24 additions & 7 deletions fourdown.py
    @@ -14,9 +14,8 @@
     '''
     
    -
     import os.path as op
    -from os import getcwd as cwd, makedirs
    +from os import getcwd as cwd, makedirs, system
     import urllib2, urllib
     import re
     import time
    @@ -25,6 +24,7 @@
     
     USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:6.0) Gecko/20100101 Firefox/6.0'
     REGEX_IMAGE = 'http://images\.4chan\.org/\w+/src/\d+\.(?:png|jpeg|jpg|gif)'
    +WGET_PATH = '/usr/bin/wget'
     
     
     class FourDown(object):
    @@ -37,6 +37,8 @@ def __init__(self, url, *args, **kwargs):
             self.page_delay = kwargs.get('page_delay', 60)
             self.url = url
             self.save_to = kwargs.get('save_to', None)
    +        self.USE_WGET = kwargs.get('USE_WGET', False)
    +        self.wget_path = kwargs.get('wget_path', WGET_PATH)
             if self.save_to is None:
                 self.save_to = cwd()
             else:
    @@ -53,9 +55,9 @@ def get_page(self):
             return self.page
     
         def _remove_dupes(self, items):
    +        # from here: http://code.activestate.com/recipes/52560/#c3
             set = {}
    -        map(set.__setitem__, items, [])
    -        return set.keys()
    +        return [set.setdefault(e,e) for e in items if e not in set]
     
         def _query_images(self):
             return self._remove_dupes(self.url_regex.findall(self.page))
    @@ -66,6 +68,12 @@ def _make_path(self):
             except OSError:
                 pass
     
    +    def _get_url(self, image, save_file):
    +        if self.USE_WGET:
    +            system('%s %s -O %s' % (self.wget_path, image, save_file))
    +        else:
    +            urllib.urlretrieve(image, save_file)
    +
         def get_images(self):
             self._make_path()
             images = self._query_images()
    @@ -80,8 +88,9 @@ def get_images(self):
                 if not op.isfile(save_file):
                     try:
                         print '%s Getting %s...' % (progress, filename)
    -                    urllib.urlretrieve(image, save_file)
    -                except:
    +                    self._get_url(image, save_file)
    +                except Exception as error:
    +                    print error
                         print '%s Failed getting %s, we will get it next time' % (progress, image)
                     time.sleep(self.download_delay)
    @@ -124,5 +133,13 @@ def start_loop(self):
         except IndexError:
             save_to = None
     
    -    f = FourDown(url, save_to=save_to)
    +    try:
    +        if sys.argv[3] == 'wget':
    +            use_wget = True
    +        else:
    +            use_wget = False
    +    except IndexError:
    +        use_wget = False
    +
    +    f = FourDown(url, save_to=save_to, USE_WGET=use_wget)
         f.start_loop()
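
    The new one-liner in _remove_dupes is the order-preserving variant of the
    ActiveState recipe linked in the diff: it keeps the first occurrence of each
    URL in page order, whereas the old dict-keys version returned them in
    arbitrary order. A standalone sketch of the difference (function names are
    mine, not the gist's; Python 2):

        def dedupe_keep_order(items):
            # What _remove_dupes does after this revision: the first
            # occurrence of each item wins, original order preserved.
            seen = {}
            return [seen.setdefault(e, e) for e in items if e not in seen]

        def dedupe_unordered(items):
            # The previous version: same set of items, dict-key order.
            seen = {}
            map(seen.__setitem__, items, [])
            return seen.keys()

        print dedupe_keep_order(['b.jpg', 'a.jpg', 'b.jpg'])  # ['b.jpg', 'a.jpg']
        print dedupe_unordered(['b.jpg', 'a.jpg', 'b.jpg'])   # order not guaranteed

    This revision also adds an optional wget download path (USE_WGET, toggled by
    a third command line argument). Note that _get_url interpolates the URL into
    an os.system command unquoted; that is only safe here because the URLs the
    regex can match contain no shell metacharacters.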
  3. rvause revised this gist Aug 23, 2011. 1 changed file with 14 additions and 3 deletions.
    17 changes: 14 additions & 3 deletions fourdown.py
    @@ -14,6 +14,7 @@
     '''
     
    +
     import os.path as op
     from os import getcwd as cwd, makedirs
     import urllib2, urllib
    @@ -51,8 +52,13 @@ def get_page(self):
     
             return self.page
     
    +    def _remove_dupes(self, items):
    +        set = {}
    +        map(set.__setitem__, items, [])
    +        return set.keys()
    +
         def _query_images(self):
    -        return self.url_regex.findall(self.page)
    +        return self._remove_dupes(self.url_regex.findall(self.page))
     
         def _make_path(self):
             try:
    @@ -63,15 +69,20 @@ def _make_path(self):
         def get_images(self):
             self._make_path()
             images = self._query_images()
    +        total = len(images)
    +        counter = 0
    +        print '%d images in thread' % total
             for image in images:
    +            counter += 1
    +            progress = '[%d/%d]' % (counter, total)
                 filename = ''.join(image.split('/')[-1:])
                 save_file = op.join(self.save_to, filename)
                 if not op.isfile(save_file):
                     try:
    -                    print 'Getting %s...' % filename
    +                    print '%s Getting %s...' % (progress, filename)
                         urllib.urlretrieve(image, save_file)
                     except:
    -                    print 'Failed getting %s, we will get it next time' % image
    +                    print '%s Failed getting %s, we will get it next time' % (progress, image)
                     time.sleep(self.download_delay)
     
         def start_loop(self):
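
    The dedup pass added here exists because findall can return the same image
    URL more than once per thread, for example when the link markup repeats the
    URL as its visible text. A hypothetical illustration (the sample markup is
    invented; Python 2):

        import re

        REGEX_IMAGE = 'http://images\.4chan\.org/\w+/src/\d+\.(?:png|jpeg|jpg|gif)'
        page = ('<a href="http://images.4chan.org/hr/src/1314100000000.jpg">'
                'http://images.4chan.org/hr/src/1314100000000.jpg</a>')

        # Both occurrences match, so the same file would be considered twice
        # without _remove_dupes.
        print re.findall(REGEX_IMAGE, page)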
  4. rvause created this gist Aug 23, 2011.
    117 changes: 117 additions & 0 deletions fourdown.py
    @@ -0,0 +1,117 @@
    #!/usr/bin/env python
    
    '''
    fourdown.py
    
    A simple script to grab links to images found on a page.
    
    You can use it as is for downloading images from a thread on 4chan, or you
    can import FourDown and do whatever you want with it.
    
    Usage example:
    
    ./fourdown.py http://boards.4chan.org/hr/res/1382026 /path/to/hi/res/store
    '''
    
    
    import os.path as op
    from os import getcwd as cwd, makedirs
    import urllib2, urllib
    import re
    import time
    import sys
    
    
    USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:6.0) Gecko/20100101 Firefox/6.0'
    REGEX_IMAGE = 'http://images\.4chan\.org/\w+/src/\d+\.(?:png|jpeg|jpg|gif)'
    
    
    class FourDown(object):
    
        def __init__(self, url, *args, **kwargs):
            self.url_regex = re.compile(kwargs.get('regex', REGEX_IMAGE))
            # Plain header string, not a compiled regex.
            self.user_agent = kwargs.get('user_agent', USER_AGENT)
            self.retry_delay = kwargs.get('retry_delay', 60)
            self.download_delay = kwargs.get('download_delay', 5)
            self.page_delay = kwargs.get('page_delay', 60)
            self.url = url
            self.save_to = kwargs.get('save_to', None)
            if self.save_to is None:
                self.save_to = cwd()
            else:
                self.save_to = op.abspath(self.save_to)
            self.page = ''
    
        def get_page(self):
            # Fetch the thread HTML with a browser-like user agent.
            request = urllib2.Request(self.url, None, {'User-agent': self.user_agent})
            response = urllib2.urlopen(request)
            self.page = response.read()
            return self.page
    
        def _query_images(self):
            return self.url_regex.findall(self.page)
    
        def _make_path(self):
            try:
                makedirs(self.save_to)
            except OSError:
                pass  # directory already exists
    
        def get_images(self):
            self._make_path()
            images = self._query_images()
            for image in images:
                filename = ''.join(image.split('/')[-1:])
                save_file = op.join(self.save_to, filename)
                if not op.isfile(save_file):
                    try:
                        print 'Getting %s...' % filename
                        urllib.urlretrieve(image, save_file)
                    except:
                        print 'Failed getting %s, we will get it next time' % image
                    time.sleep(self.download_delay)
    
        def start_loop(self):
            print 'Using %s to store images' % self.save_to
    
            while True:
                try:
                    print 'Getting page...'
                    self.get_page()
                except urllib2.HTTPError as error:
                    if error.code == 404:
                        # Thread is gone; nothing more to download.
                        print '404: Stopping...'
                        break
                    else:
                        print 'Error getting page, will retry in %s seconds' % self.retry_delay
                        time.sleep(self.retry_delay)
                        continue
                except urllib2.URLError:
                    print 'Error getting page, will retry in %s seconds' % self.retry_delay
                    time.sleep(self.retry_delay)
                    continue
    
                print 'Downloading images...'
                self.get_images()
                print 'Done for now, will check again in %s seconds' % self.page_delay
                time.sleep(self.page_delay)
    
    
    if __name__ == '__main__':
    
        try:
            url = sys.argv[1]
        except IndexError:
            print 'You must provide a url'
            sys.exit(1)
    
        try:
            save_to = sys.argv[2]
        except IndexError:
            save_to = None
    
        f = FourDown(url, save_to=save_to)
        f.start_loop()
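
    As the docstring says, FourDown can also be imported and driven directly
    rather than run as a script. A minimal sketch, assuming the file is saved as
    fourdown.py on your path (the URL and directory are placeholders; Python 2):

        from fourdown import FourDown

        fd = FourDown('http://boards.4chan.org/hr/res/1382026',
                      save_to='/tmp/hr',      # default: current directory
                      download_delay=2)       # seconds between image fetches
        fd.get_page()    # fetch the thread HTML once
        fd.get_images()  # download any images not already on disk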