"""
  Find the main image on a Wikipedia page and download it.
  Using a list of Wikipedia URLs, download the main image from each page in the list.
  Name the downloaded file to match the page URL.
"""

import os
from urllib.parse import urlparse

import requests

# set the folder name where images will be stored
my_folder = 'wiki_images'

# create the folder in the current working directory
# in which to store the downloaded images
# (exist_ok=True means re-running the script does not fail
# when the folder is already there)
os.makedirs(my_folder, exist_ok=True)

# front part of each Wikipedia URL
base_url = 'https://en.wikipedia.org/wiki/'

# partial URLs for each desired Wikipedia page
my_list = [
    'Anaea_troglodyta',
    'Colias_eurytheme',
    'Euphilotes_battoides',
    'Great_spangled_fritillary',
    'Papilio_troilus',
]

# Wikipedia API query string to get the main image on a page
# (partial URL will be added to the end, so 'titles=' must stay last)
# Uses HTTPS, like base_url, so the request is not sent in plaintext.
query = (
    'https://en.wikipedia.org/w/api.php'
    '?action=query&prop=pageimages&format=json&piprop=original&titles='
)

# get JSON data w/ API and extract image URL
def get_image_url(partial_url):
    """Return the URL of the main image on a Wikipedia page, or None.

    Requests the module-level ``query`` string with ``partial_url``
    appended and reads ``query -> pages -> <page> -> original -> source``
    from the JSON reply.

    Args:
        partial_url: Page title as it appears at the end of the page
            URL, e.g. ``'Papilio_troilus'``.

    Returns:
        The image URL as a string, or None when no page in the reply
        carries an image URL or when the request / parsing failed.
        Failures are printed rather than raised so that one bad page
        does not stop a batch run.
    """
    try:
        # A timeout keeps a stalled connection from hanging the script.
        api_res = requests.get(query + partial_url, timeout=30)
        # Fail on HTTP error statuses instead of parsing an error body.
        api_res.raise_for_status()
        pages = api_res.json()['query']['pages']
        # Pages are keyed by article id, which is not known in advance,
        # so scan the values instead of looking one up.
        for page in pages.values():
            # A page entry without an 'original' image is not an error;
            # skip it quietly instead of raising KeyError.
            source = page.get('original', {}).get('source')
            if source:
                return source
    except Exception as exc:
        # Best-effort by design: report the problem and fall through.
        print(exc)
        print("Partial URL: " + partial_url)
    # Explicit None covers both the failure path and "no image found"
    # (the original could hit an unbound local here).
    return None

# download one image with URL obtained from API
def download_image(the_url, the_page):
    """Download one image and save it in ``my_folder``, named after the page.

    The saved filename is the base name of ``the_page`` plus the
    lower-cased file extension taken from ``the_url``.

    Args:
        the_url: Direct URL of the image file.
        the_page: Partial page URL used as the filename stem.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
        OSError: If the target file cannot be written.
    """
    # Take the extension from the URL's path component only, so a query
    # string or a dot elsewhere in the URL cannot end up in the filename.
    file_ext = os.path.splitext(urlparse(the_url).path)[1].lower()
    target = os.path.join(my_folder, os.path.basename(the_page + file_ext))

    # stream=True lets iter_content pull the body in chunks instead of
    # loading the whole image into memory first; the context managers
    # release the connection and close the file even if writing fails.
    with requests.get(the_url, stream=True, timeout=30) as res:
        res.raise_for_status()
        # save the image to folder - binary file - with desired filename
        # HT to Automate the Boring Stuff with Python, chapter 12
        with open(target, 'wb') as image_file:
            for chunk in res.iter_content(100000):
                image_file.write(chunk)

# loop to download main image for each page in list
# (counter numbers the pages from 1, including pages without an image)
for counter, the_page in enumerate(my_list, start=1):
    # get JSON data and extract image URL
    the_url = get_image_url(the_page)
    # if there is no URL, report it and move on to the next page
    if not the_url:
        print("No image file for " + the_page)
        continue
    # tell us where we are for the heck of it
    print("Downloading image " + str(counter))
    # download that image; a failure on one page is reported and
    # skipped so the remaining pages are still processed
    try:
        download_image(the_url, the_page)
    except (requests.RequestException, OSError) as exc:
        print(exc)
        print("Download failed for " + the_page)

print("All done!")