"""
Find the main image on a Wikipedia page and download it.

Using a list of Wikipedia URLs, download the main image from each page
in the list. Name the downloaded file to match the page URL.
"""

import os
from urllib.parse import urlparse

import requests

# set the folder name where images will be stored
my_folder = 'wiki_images'

# create the folder in the current working directory
# in which to store the downloaded images
os.makedirs(my_folder, exist_ok=True)

# front part of each Wikipedia URL
base_url = 'https://en.wikipedia.org/wiki/'

# partial URLs for each desired Wikipedia page
my_list = ['Anaea_troglodyta', 'Colias_eurytheme', 'Euphilotes_battoides',
           'Great_spangled_fritillary', 'Papilio_troilus']

# Wikipedia API query string to get the main image on a page
# (partial URL will be added to the end)
query = 'https://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='

# seconds to wait on any single HTTP request before giving up, so that
# one unresponsive server cannot hang the whole run
REQUEST_TIMEOUT = 30


# get JSON data w/ API and extract image URL
def get_image_url(partial_url, timeout=REQUEST_TIMEOUT):
    """Return the URL of the main image on one Wikipedia page.

    Args:
        partial_url: page title as it appears at the end of the page
            URL; it is appended to the module-level ``query`` string.
        timeout: seconds to wait for the API before giving up.

    Returns:
        The image URL as a string, or None when the page has no main
        image or the request / response parsing fails. Failures are
        printed rather than raised so one bad page does not stop the
        batch.
    """
    try:
        api_res = requests.get(query + partial_url, timeout=timeout)
        api_res.raise_for_status()
        pages = api_res.json()['query']['pages']
        # this is a way around not knowing the article id number:
        # 'pages' is keyed by id, so just look through its values
        for page in pages.values():
            # .get() because a page without a main image simply has
            # no 'original' entry - that is not an error
            source = page.get('original', {}).get('source')
            if source:
                return source
    except Exception as exc:
        # deliberate best-effort: report the problem and move on
        print(exc)
        print("Partial URL: " + partial_url)
    return None


# download one image with URL obtained from API
def download_image(the_url, the_page, timeout=REQUEST_TIMEOUT):
    """Download one image into ``my_folder``, named after its page.

    Args:
        the_url: full URL of the image file.
        the_page: partial page URL; used as the saved file's base name.
        timeout: seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: the request failed or the server
            answered with an HTTP error status.
        OSError: the file could not be written.
    """
    # get original file extension for image from the path part of the
    # URL only, so a query string or fragment cannot leak into it;
    # this is '' when the path has no extension
    file_ext = os.path.splitext(urlparse(the_url).path)[1].lower()

    # basename() drops any directory part, so the file always lands
    # directly inside my_folder
    file_path = os.path.join(my_folder,
                             os.path.basename(the_page + file_ext))

    # stream=True so the image is fetched chunk by chunk instead of
    # being held in memory all at once
    with requests.get(the_url, stream=True, timeout=timeout) as res:
        res.raise_for_status()
        # save the image to folder - binary file - with desired filename
        # HT to Automate the Boring Stuff with Python, chapter 12
        with open(file_path, 'wb') as image_file:
            for chunk in res.iter_content(100000):
                image_file.write(chunk)


# loop to download main image for each page in list
for counter, the_page in enumerate(my_list, start=1):
    # get JSON data and extract image URL
    the_url = get_image_url(the_page)
    if not the_url:
        print("No image file for " + the_page)
        continue
    # tell us where we are for the heck of it
    print("Downloading image " + str(counter))
    try:
        # download that image
        download_image(the_url, the_page)
    except requests.RequestException as exc:
        # one failed download should not abort the remaining pages
        print(exc)
        print("Download failed for " + the_page)

print("All done!")