
@macloo
Last active March 25, 2025 21:23

Revisions

  1. macloo revised this gist Apr 6, 2021. 1 changed file with 2 additions and 1 deletion.

    download_images.py: 3 changes (2 additions & 1 deletion)

    @@ -55,7 +55,8 @@ def download_image(the_url, the_page):
         # save the image to folder - binary file - with desired filename
         image_file = open(os.path.join(my_folder, os.path.basename(the_page + file_ext)), 'wb')

    -    # download the image file
    +    # download the image file
    +    # HT to Automate the Boring Stuff with Python, chapter 12
         for chunk in res.iter_content(100000):
             image_file.write(chunk)
         image_file.close()
  2. macloo created this gist Apr 6, 2021.

    download_images.py: 78 changes (78 additions & 0 deletions)

    @@ -0,0 +1,78 @@
    """
    Find the main image on a Wikipedia page and download it.
    Using a list of Wikipedia URLs, download the main image from each page in the list.
    Name the downloaded file to match the page URL.
    """

    import requests, os

    # set the folder name where images will be stored
    my_folder = 'wiki_images'

    # create the folder in the current working directory
    # in which to store the downloaded images
    os.makedirs(my_folder, exist_ok=True)

    # front part of each Wikipedia URL
    base_url = 'https://en.wikipedia.org/wiki/'

    # partial URLs for each desired Wikipedia page
    my_list = ['Anaea_troglodyta',
    'Colias_eurytheme',
    'Euphilotes_battoides',
    'Great_spangled_fritillary',
    'Papilio_troilus']

    # Wikipedia API query string to get the main image on a page
    # (partial URL will be added to the end)
    query = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='
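
    # For reference (an editor's note, not part of the original gist): with
    # piprop=original, the JSON returned by this query has roughly the shape
    # below. The page-ID key is not known in advance, which is why
    # get_image_url() iterates over the 'pages' dict instead of indexing it.
    # The ID and title here are illustrative:
    #
    # {"query": {"pages": {"12345": {
    #     "pageid": 12345, "ns": 0, "title": "Papilio troilus",
    #     "original": {"source": "https://upload.wikimedia.org/...",
    #                  "width": ..., "height": ...}}}}}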

    # get JSON data w/ API and extract image URL
    def get_image_url(partial_url):
        data = None  # start as None so the final return can't hit an unbound name
        try:
            api_res = requests.get(query + partial_url).json()
            first_part = api_res['query']['pages']
            # this is a way around not knowing the article id number
            for key, value in first_part.items():
                if (value['original']['source']):
                    data = value['original']['source']
                    return data
        except Exception as exc:
            print(exc)
            print("Partial URL: " + partial_url)
            data = None
        return data

    # download one image with URL obtained from API
    def download_image(the_url, the_page):
        res = requests.get(the_url)
        res.raise_for_status()

        # get original file extension for image
        # by splitting on . and getting the final segment
        file_ext = '.' + the_url.split('.')[-1].lower()
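        # an editor's example (hypothetical URL, not from the gist):
        #   'https://upload.wikimedia.org/.../Some_image.JPG' -> '.jpg'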

        # save the image to folder - binary file - with desired filename
        image_file = open(os.path.join(my_folder, os.path.basename(the_page + file_ext)), 'wb')

        # download the image file
        for chunk in res.iter_content(100000):
            image_file.write(chunk)
        image_file.close()
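        # An editor's aside, not part of the gist: iter_content() writes the
        # response to disk in 100,000-byte chunks (the technique credited to
        # Automate the Boring Stuff, chapter 12, in the later revision); a
        # 'with' block would be an equivalent way to guarantee the file is
        # closed even if a write fails, e.g.:
        #
        #   with open(os.path.join(my_folder, the_page + file_ext), 'wb') as f:
        #       for chunk in res.iter_content(100000):
        #           f.write(chunk)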

    # loop to download main image for each page in list
    counter = 1
    for the_page in my_list:
        # get JSON data and extract image URL
        the_url = get_image_url(the_page)
        # if the URL is not None ...
        if (the_url):
            # tell us where we are for the heck of it
            print("Downloading image " + str(counter))
            # download that image
            download_image(the_url, the_page)
        else:
            print("No image file for " + the_page)
        counter += 1

    print("All done!")