| """ | |
| Find the main image on a Wikipedia page and download it. | |
| Using a list of Wikipedia URLs, download the main image from each page in the list. | |
| Name the downloaded file to match the page URL. | |
| """ | |
| import requests, os | |
| # set the folder name where images will be stored | |
| my_folder = 'wiki_images' | |
| # create the folder in the current working directory | |
| # in which to store the downloaded images | |
| os.makedirs(my_folder, exist_ok=True) | |
| # front part of each Wikipedia URL | |
| base_url = 'https://en.wikipedia.org/wiki/' | |
| # partial URLs for each desired Wikipedia page | |
| my_list = ['Anaea_troglodyta', | |
| 'Colias_eurytheme', | |
| 'Euphilotes_battoides', | |
| 'Great_spangled_fritillary', | |
| 'Papilio_troilus'] | |
| # Wikipedia API query string to get the main image on a page | |
| # (partial URL will be added to the end) | |
| query = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles=' | |
| # get JSON data w/ API and extract image URL | |
| def get_image_url(partial_url): | |
| try: | |
| api_res = requests.get(query + partial_url).json() | |
| first_part = api_res['query']['pages'] | |
| # this is a way around not knowing the article id number | |
| for key, value in first_part.items(): | |
| if (value['original']['source']): | |
| data = value['original']['source'] | |
| return data | |
| except Exception as exc: | |
| print(exc) | |
| print("Partial URL: " + partial_url) | |
| data = None | |
| return data | |
| # download one image with URL obtained from API | |
| def download_image(the_url, the_page): | |
| res = requests.get(the_url) | |
| res.raise_for_status() | |
| # get original file extension for image | |
| # by splitting on . and getting the final segment | |
| file_ext = '.' + the_url.split('.')[-1].lower() | |
| # save the image to folder - binary file - with desired filename | |
| image_file = open(os.path.join(my_folder, os.path.basename(the_page + file_ext)), 'wb') | |
| # download the image file | |
| # HT to Automate the Boring Stuff with Python, chapter 12 | |
| for chunk in res.iter_content(100000): | |
| image_file.write(chunk) | |
| image_file.close() | |
| # loop to download main image for each page in list | |
| counter = 1 | |
| for the_page in my_list: | |
| # get JSON data and extract image URL | |
| the_url = get_image_url(the_page) | |
| # if the URL is not None ... | |
| if (the_url): | |
| # tell us where we are for the heck of it | |
| print("Downloading image " + str(counter)) | |
| # download that image | |
| download_image(the_url, the_page) | |
| else: | |
| print("No image file for " + the_page) | |
| counter += 1 | |
| print("All done!") |
@nuganics I'll bet adding the header shown there will fix the script. I will try it when I have time. Thanks for pointing out the error.
@macloo Thank you for the reply. I don't know Python, but I just read your Python course, chapters 1 to 3. I added headers = {'User-Agent': 'CoolBot/0.0 (https://example.org/coolbot/; [email protected])'} on line 18, under the base_url line. On line 49 I changed res = requests.get(the_url) to res = requests.get(the_url, headers=headers).
It seems to work.
I learnt Pascal in school and know Excel a little but got there with your tutorial so thank you :)
I used a Chrome extension (Link Klipper) to get the Wikipedia URLs, then used Excel to extract the page names with =TRIM(RIGHT(SUBSTITUTE(A1,"/",REPT(" ",255)),255)) and ="'"&TEXTJOIN("', '",1,F1:F1000)&"'" to join them for use in your script.
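If you would rather skip the Excel step, a minimal Python sketch of the same extraction is below: it keeps only the part of each full URL after the last slash. The filename wiki_urls.txt (one full URL per line) is just an assumption.

    # Minimal sketch, assuming a file named wiki_urls.txt with one full
    # Wikipedia URL per line; keeps only the text after the last "/"
    # (the same thing the Excel formula above does).
    with open('wiki_urls.txt') as url_file:
        my_list = [line.strip().rstrip('/').split('/')[-1]
                   for line in url_file if line.strip()]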
@nuganics You shouldn't use 'CoolBot/0.0 (https://example.org/coolbot/; [email protected])' but should instead edit that string to identify yourself. It is a way of saying: "Hi, I am not evil. Here's my identification." That is explained at https://meta.wikimedia.org/wiki/User-Agent_policy#Python, though maybe not clearly enough there.
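A minimal sketch of that change, with placeholder identification (the script name, URL, and address below are made up and should be replaced with your own details):

    import requests

    # placeholder identification - replace with your own script name, a page
    # about it (or your user page), and a way to contact you
    headers = {'User-Agent': 'MyWikiImageScript/1.0 (https://example.org/my-script; me@example.org)'}

    # then every requests.get() call in the script passes the headers, e.g.:
    # api_res = requests.get(query + partial_url, headers=headers).json()   # in get_image_url()
    # res = requests.get(the_url, headers=headers)                          # in download_image()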
I'm getting blocked with a 403; I guess the user agent now needs to comply with https://meta.wikimedia.org/wiki/User-Agent_policy#Python?