Revisions
macloo revised this gist on Apr 6, 2021. 1 changed file with 2 additions and 1 deletion.
```diff
@@ -55,7 +55,8 @@ def download_image(the_url, the_page):
     # save the image to folder - binary file - with desired filename
     image_file = open(os.path.join(my_folder, os.path.basename(the_page + file_ext)), 'wb')
-    # download the image file
+    # download the image file
+    # HT to Automate the Boring Stuff with Python, chapter 12
     for chunk in res.iter_content(100000):
         image_file.write(chunk)
     image_file.close()
```
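The chunked write in this revision credits the pattern from *Automate the Boring Stuff with Python*, chapter 12. For comparison, here is a minimal sketch of the same download step using a `with` block instead of an explicit `close()`, so the file handle is released even if a write fails. The helper name `save_image` is hypothetical, not part of the gist:

```python
import requests

# Hypothetical helper (not in the gist): same chunked download,
# rewritten with a context manager instead of an explicit close().
def save_image(the_url, path):
    res = requests.get(the_url)
    res.raise_for_status()
    with open(path, 'wb') as image_file:
        # write the image 100,000 bytes at a time, as in the gist
        for chunk in res.iter_content(100000):
            image_file.write(chunk)
```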
macloo created this gist on Apr 6, 2021.
```python
"""
Find the main image on a Wikipedia page and download it.

Using a list of Wikipedia URLs, download the main image from each
page in the list. Name the downloaded file to match the page URL.
"""

import requests, os

# set the folder name where images will be stored
my_folder = 'wiki_images'

# create the folder in the current working directory
# in which to store the downloaded images
os.makedirs(my_folder, exist_ok=True)

# front part of each Wikipedia URL
base_url = 'https://en.wikipedia.org/wiki/'

# partial URLs for each desired Wikipedia page
my_list = ['Anaea_troglodyta',
           'Colias_eurytheme',
           'Euphilotes_battoides',
           'Great_spangled_fritillary',
           'Papilio_troilus']

# Wikipedia API query string to get the main image on a page
# (partial URL will be added to the end)
query = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='

# get JSON data w/ API and extract image URL
def get_image_url(partial_url):
    try:
        api_res = requests.get(query + partial_url).json()
        first_part = api_res['query']['pages']
        # this is a way around not knowing the article id number
        for key, value in first_part.items():
            if (value['original']['source']):
                data = value['original']['source']
        return data
    except Exception as exc:
        print(exc)
        print("Partial URL: " + partial_url)
        data = None
    return data

# download one image with URL obtained from API
def download_image(the_url, the_page):
    res = requests.get(the_url)
    res.raise_for_status()
    # get original file extension for image
    # by splitting on . and getting the final segment
    file_ext = '.' + the_url.split('.')[-1].lower()
    # save the image to folder - binary file - with desired filename
    image_file = open(os.path.join(my_folder, os.path.basename(the_page + file_ext)), 'wb')
    # download the image file
    for chunk in res.iter_content(100000):
        image_file.write(chunk)
    image_file.close()

# loop to download main image for each page in list
counter = 1
for the_page in my_list:
    # get JSON data and extract image URL
    the_url = get_image_url(the_page)
    # if the URL is not None ...
    if (the_url):
        # tell us where we are for the heck of it
        print("Downloading image " + str(counter))
        # download that image
        download_image(the_url, the_page)
    else:
        print("No image file for " + the_page)
    counter += 1

print("All done!")
```
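To see why `get_image_url()` loops over `first_part.items()`, it helps to inspect the raw API response: the `pages` object is keyed by the article's numeric page id, which the script doesn't know in advance. Below is a minimal sketch using the same endpoint and parameters the script builds into its `query` string, with one title from `my_list`; the example id in the comment is a placeholder, not a real value:

```python
import requests

# same parameters the gist hard-codes into its query string,
# passed as a dict instead of a hand-built URL
params = {
    'action': 'query',
    'prop': 'pageimages',
    'format': 'json',
    'piprop': 'original',
    'titles': 'Papilio_troilus',
}
res = requests.get('https://en.wikipedia.org/w/api.php', params=params).json()

# 'pages' is keyed by the numeric page id, roughly:
# {"query": {"pages": {"<page_id>": {"original": {"source": "https://..."}}}}}
for page_id, page in res['query']['pages'].items():
    print(page_id, page.get('original', {}).get('source'))
```

Passing the parameters via `params=` sends the same request as the concatenated query string, and `page.get('original', {})` returns `None` for the source when a page has no main image, rather than raising the `KeyError` the gist catches in its `try`/`except`.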