Skip to content

Instantly share code, notes, and snippets.

@crizCraig
Created May 27, 2012 22:52
Show Gist options
  • Select an option

  • Save crizCraig/2816295 to your computer and use it in GitHub Desktop.

Select an option

Save crizCraig/2816295 to your computer and use it in GitHub Desktop.

Revisions

  1. crizCraig revised this gist May 27, 2012. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -51,4 +51,4 @@ def go(query, path):
    time.sleep(1.5)

    # Example use
    go('landscape', 'negative_examples')
    go('landscape', 'myDirectory')
  2. crizCraig revised this gist May 27, 2012. 1 changed file with 10 additions and 9 deletions.
    19 changes: 10 additions & 9 deletions gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -20,33 +20,34 @@ def go(query, path):
    if not os.path.exists(BASE_PATH):
    os.makedirs(BASE_PATH)

    start = 0 # Start query string parameter for pagination.
    while start < 60: # Google returns a max of 56 results.
    start = 0 # Google's start query string parameter for pagination.
    while start < 60: # Google will only return a max of 56 results.
    r = requests.get(BASE_URL % start)
    for image_info in json.loads(r.text)['responseData']['results']:
    url = image_info['unescapedUrl']
    try:
    image_r = requests.get(image_info['unescapedUrl'])
    image_r = requests.get(url)
    except ConnectionError, e:
    print 'could not download %s' % image_info['url']
    print 'could not download %s' % url
    continue

    # Remove file system path characters from name.
    # Remove file-system path characters from name.
    title = image_info['titleNoFormatting'].replace('/', '').replace('\\', '')

    file = open(os.path.join(BASE_PATH, '%s.jpg') % title, 'w')
    try:
    Image.open(StringIO(image_r.content)).save(file, 'JPEG')
    except IOError, e:
    # This usually throws away some gifs. But who cares about gifs.
    print 'could not save %s' % image_info['url']
    # Throw away some gifs...blegh.
    print 'could not save %s' % url
    continue
    finally:
    file.close()

    print start
    start += 4 # Four images are returned per page.
    start += 4 # 4 images per page.

    # Be nice to Google and they'll be nice to you :)
    # Be nice to Google and they'll be nice back :)
    time.sleep(1.5)

    # Example use
  3. crizCraig revised this gist May 27, 2012. 1 changed file with 22 additions and 11 deletions.
    33 changes: 22 additions & 11 deletions gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -7,7 +7,11 @@
    from requests.exceptions import ConnectionError

    def go(query, path):
    """Download full size images from Google image search."""
    """Download full size images from Google image search.
    Don't print or republish images without permission.
    I used this to train a learning algorithm.
    """
    BASE_URL = 'https://ajax.googleapis.com/ajax/services/search/images?'\
    'v=1.0&q=' + query + '&start=%d'

    @@ -16,27 +20,34 @@ def go(query, path):
    if not os.path.exists(BASE_PATH):
    os.makedirs(BASE_PATH)

    BASE_PATH = os.path.join(BASE_PATH, '%s.jpg')

    start = 0
    while start < 60:
    start = 0 # Start query string parameter for pagination.
    while start < 60: # Google returns a max of 56 results.
    r = requests.get(BASE_URL % start)
    image_json = json.loads(r.text)
    image_infos = json.loads(r.text)['responseData']['results']
    for image_info in image_infos:
    for image_info in json.loads(r.text)['responseData']['results']:
    try:
    image_r = requests.get(image_info['unescapedUrl'])
    except ConnectionError, e:
    print 'could not download %s' % image_info['url']
    continue

    # Remove file system path characters from name.
    title = image_info['titleNoFormatting'].replace('/', '').replace('\\', '')
    file = open(BASE_PATH % title, 'w')

    file = open(os.path.join(BASE_PATH, '%s.jpg') % title, 'w')
    try:
    Image.open(StringIO(image_r.content)).save(file, 'JPEG')
    except IOError, e:
    # This usually throws away some gifs. But who cares about gifs.
    print 'could not save %s' % image_info['url']
    continue
    finally:
    file.close()

    print start
    start += 4
    start += 4 # Four images are returned per page.

    # Be nice to Google and they'll be nice to you :)
    time.sleep(1.5)

    go('landscapes', 'negative_examples')
    # Example use
    go('landscape', 'negative_examples')
  4. crizCraig renamed this gist May 27, 2012. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  5. crizCraig created this gist May 27, 2012.
    42 changes: 42 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,42 @@
    import json
    import os
    import time
    import requests
    from PIL import Image
    from StringIO import StringIO
    from requests.exceptions import ConnectionError

    def go(query, path):
        """Download full-size Google image search results as JPEG files.

        Pages through the Google AJAX image search endpoint for ``query`` and
        stores every result that can be fetched and converted as
        ``<path>/<query>/<title>.jpg``, creating the directory when needed.
        Downloads that fail and images that cannot be converted to JPEG are
        reported on stdout and skipped. The current ``start`` offset is
        printed after each page as a progress indicator.

        Args:
            query: Search terms. URL-encoded by ``requests`` before sending.
            path: Parent directory of the per-query output directory.
        """
        search_url = 'https://ajax.googleapis.com/ajax/services/search/images'
        start_limit = 60    # Paging stops once the start offset reaches this.
        page_step = 4       # Amount the start offset advances per request.
        page_delay = 1.5    # Seconds to pause between search requests.

        target_dir = os.path.join(path, query)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        start = 0
        while start < start_limit:
            # Let requests build the query string so that spaces, '&', '%'
            # and non-ASCII characters in the query are encoded correctly.
            r = requests.get(
                search_url,
                params={'v': '1.0', 'q': query, 'start': start})

            # Defensive: 'responseData' may be missing or null (presumably
            # when the service rejects the request), so stop paging instead
            # of crashing on the lookup.
            response_data = json.loads(r.text).get('responseData') or {}
            image_infos = response_data.get('results') or []
            if not image_infos:
                break

            for image_info in image_infos:
                url = image_info['unescapedUrl']
                try:
                    image_r = requests.get(url)
                except ConnectionError:
                    print('could not download %s' % url)
                    # Without this the code below would use a missing (or the
                    # previous iteration's) response.
                    continue

                # Convert in memory first so that a failed conversion never
                # leaves an empty or truncated .jpg on disk.
                jpeg_data = StringIO()
                try:
                    Image.open(StringIO(image_r.content)).save(jpeg_data, 'JPEG')
                except IOError:
                    print('could not save %s' % url)
                    continue

                # Remove file-system path separators from the name.
                title = image_info['titleNoFormatting']
                title = title.replace('/', '').replace('\\', '')

                # Append the extension directly rather than %-formatting the
                # whole path, which breaks when path or query contains '%'.
                file_name = os.path.join(target_dir, title + '.jpg')

                # JPEG data is binary; the with-block guarantees the close.
                with open(file_name, 'wb') as image_file:
                    image_file.write(jpeg_data.getvalue())

            print(start)
            start += page_step

            # Throttle requests to the search service.
            time.sleep(page_delay)

    # Example use: saves 'landscapes' results under negative_examples/landscapes.
    go('landscapes', 'negative_examples')