crizCraig · May 27, 2012 22:52 · May 27, 2012 · May 27, 2012 · May 27, 2012 · May 27, 2012
diff --git a/gistfile1.py b/gistfile1.py
@@ -51,4 +51,4 @@ def go(query, path):
     time.sleep(1.5)
 
 # Example use
-go('landscape', 'negative_examples')
+go('landscape', 'myDirectory')
diff --git a/gistfile1.py b/gistfile1.py
@@ -20,33 +20,34 @@ def go(query, path):
   if not os.path.exists(BASE_PATH):
     os.makedirs(BASE_PATH)
 
-  start = 0 # Start query string parameter for pagination.
-  while start < 60: # Google returns a max of 56 results.
+  start = 0 # Google's start query string parameter for pagination.
+  while start < 60: # Google will only return a max of 56 results.
     r = requests.get(BASE_URL % start)
     for image_info in json.loads(r.text)['responseData']['results']:
+      url = image_info['unescapedUrl']
       try:
-        image_r = requests.get(image_info['unescapedUrl'])
+        image_r = requests.get(url)
       except ConnectionError, e:
-        print 'could not download %s' % image_info['url']
+        print 'could not download %s' % url
         continue
 
-      # Remove file system path characters from name.
+      # Remove file-system path characters from name.
       title = image_info['titleNoFormatting'].replace('/', '').replace('\\', '')
 
       file = open(os.path.join(BASE_PATH, '%s.jpg') % title, 'w')
       try:
         Image.open(StringIO(image_r.content)).save(file, 'JPEG')
       except IOError, e:
-        # This usually throws away some gifs. But who cares about gifs.
-        print 'could not save %s' % image_info['url']
+        # Throw away some gifs...blegh.
+        print 'could not save %s' % url
         continue
       finally:
         file.close()
 
     print start
-    start += 4 # Four images are returned per page.
+    start += 4 # 4 images per page.
 
-    # Be nice to Google and they'll be nice to you :)
+    # Be nice to Google and they'll be nice back :)
     time.sleep(1.5)
 
 # Example use

diff --git a/gistfile1.py b/gistfile1.py
@@ -7,7 +7,11 @@
 from requests.exceptions import ConnectionError
 
 def go(query, path):
-  """Download full size images from Google image search."""
+  """Download full size images from Google image search.
+
+  Don't print or republish images without permission.
+  I used this to train a learning algorithm.
+  """
   BASE_URL = 'https://ajax.googleapis.com/ajax/services/search/images?'\
              'v=1.0&q=' + query + '&start=%d'
 
@@ -16,27 +20,34 @@ def go(query, path):
   if not os.path.exists(BASE_PATH):
     os.makedirs(BASE_PATH)
 
-  BASE_PATH = os.path.join(BASE_PATH, '%s.jpg')
-
-  start = 0
-  while start < 60:
+  start = 0 # Start query string parameter for pagination.
+  while start < 60: # Google returns a max of 56 results.
     r = requests.get(BASE_URL % start)
-    image_json = json.loads(r.text)
-    image_infos = json.loads(r.text)['responseData']['results']
-    for image_info in image_infos:
+    for image_info in json.loads(r.text)['responseData']['results']:
       try:
         image_r = requests.get(image_info['unescapedUrl'])
       except ConnectionError, e:
         print 'could not download %s' % image_info['url']
+        continue
 
+      # Remove file system path characters from name.
       title = image_info['titleNoFormatting'].replace('/', '').replace('\\', '')
-      file = open(BASE_PATH % title, 'w')
+
+      file = open(os.path.join(BASE_PATH, '%s.jpg') % title, 'w')
       try:
         Image.open(StringIO(image_r.content)).save(file, 'JPEG')
       except IOError, e:
+        # This usually throws away some gifs. But who cares about gifs.
         print 'could not save %s' % image_info['url']
+        continue
+      finally:
+        file.close()
+
     print start
-    start += 4
+    start += 4 # Four images are returned per page.
+
+    # Be nice to Google and they'll be nice to you :)
     time.sleep(1.5)
 
-go('landscapes', 'negative_examples')
+# Example use
+go('landscape', 'negative_examples')
diff --git a/gistfile1.txt b/gistfile1.py
rename from gistfile1.txt
rename to gistfile1.py
diff --git a/gistfile1.txt b/gistfile1.txt
@@ -0,0 +1,42 @@
+import json
+import os
+import time
+import requests
+from PIL import Image
+from StringIO import StringIO
+from requests.exceptions import ConnectionError
+
+def go(query, path):
+  """Download full size images from Google image search."""
+  BASE_URL = 'https://ajax.googleapis.com/ajax/services/search/images?'\
+             'v=1.0&q=' + query + '&start=%d'
+
+  BASE_PATH = os.path.join(path, query)
+
+  if not os.path.exists(BASE_PATH):
+    os.makedirs(BASE_PATH)
+
+  BASE_PATH = os.path.join(BASE_PATH, '%s.jpg')
+
+  start = 0
+  while start < 60:
+    r = requests.get(BASE_URL % start)
+    image_json = json.loads(r.text)
+    image_infos = json.loads(r.text)['responseData']['results']
+    for image_info in image_infos:
+      try:
+        image_r = requests.get(image_info['unescapedUrl'])
+      except ConnectionError, e:
+        print 'could not download %s' % image_info['url']
+
+      title = image_info['titleNoFormatting'].replace('/', '').replace('\\', '')
+      file = open(BASE_PATH % title, 'w')
+      try:
+        Image.open(StringIO(image_r.content)).save(file, 'JPEG')
+      except IOError, e:
+        print 'could not save %s' % image_info['url']
+    print start
+    start += 4
+    time.sleep(1.5)
+
+go('landscapes', 'negative_examples')
No results found