
@macloo
Last active March 25, 2025 21:23

Revisions

  1. macloo revised this gist Apr 6, 2021. 1 changed file with 2 additions and 1 deletion.

    download_images.py: 3 changes (2 additions & 1 deletion)

    @@ -55,7 +55,8 @@ def download_image(the_url, the_page):
         # save the image to folder - binary file - with desired filename
         image_file = open(os.path.join(my_folder, os.path.basename(the_page + file_ext)), 'wb')

    -    # download the image file
    +    # download the image file
    +    # HT to Automate the Boring Stuff with Python, chapter 12
         for chunk in res.iter_content(100000):
             image_file.write(chunk)
         image_file.close()
  2. macloo created this gist Apr 6, 2021.

    download_images.py: 78 changes (78 additions & 0 deletions)

    @@ -0,0 +1,78 @@
    """
    Find the main image on a Wikipedia page and download it.
    Using a list of Wikipedia URLs, download the main image from each page in the list.
    Name the downloaded file to match the page URL.
    """

    import requests, os

    # set the folder name where images will be stored
    my_folder = 'wiki_images'

    # create the folder in the current working directory
    # in which to store the downloaded images
    os.makedirs(my_folder, exist_ok=True)

    # front part of each Wikipedia URL
    base_url = 'https://en.wikipedia.org/wiki/'

    # partial URLs for each desired Wikipedia page
    my_list = ['Anaea_troglodyta',
    'Colias_eurytheme',
    'Euphilotes_battoides',
    'Great_spangled_fritillary',
    'Papilio_troilus']

    # Wikipedia API query string to get the main image on a page
    # (partial URL will be added to the end)
    query = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='
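
    # For reference (an editor's note, not part of the original gist): with
    # piprop=original, the JSON returned by this query has roughly the shape
    # below. The page-ID key is not known in advance, which is why
    # get_image_url() iterates over the 'pages' dict instead of indexing it.
    # The ID and title here are illustrative:
    #
    # {"query": {"pages": {"12345": {
    #     "pageid": 12345, "ns": 0, "title": "Papilio troilus",
    #     "original": {"source": "https://upload.wikimedia.org/...",
    #                  "width": ..., "height": ...}}}}}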

    # get JSON data w/ API and extract image URL
    def get_image_url(partial_url):
        data = None  # start as None so the final return can't hit an unbound name
        try:
            api_res = requests.get(query + partial_url).json()
            first_part = api_res['query']['pages']
            # this is a way around not knowing the article id number
            for key, value in first_part.items():
                if (value['original']['source']):
                    data = value['original']['source']
                    return data
        except Exception as exc:
            print(exc)
            print("Partial URL: " + partial_url)
            data = None
        return data

    # download one image with URL obtained from API
    def download_image(the_url, the_page):
        res = requests.get(the_url)
        res.raise_for_status()

        # get original file extension for image
        # by splitting on . and getting the final segment
        file_ext = '.' + the_url.split('.')[-1].lower()
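        # an editor's example (hypothetical URL, not from the gist):
        #   'https://upload.wikimedia.org/.../Some_image.JPG' -> '.jpg'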

        # save the image to folder - binary file - with desired filename
        image_file = open(os.path.join(my_folder, os.path.basename(the_page + file_ext)), 'wb')

        # download the image file
        for chunk in res.iter_content(100000):
            image_file.write(chunk)
        image_file.close()
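        # An editor's aside, not part of the gist: iter_content() writes the
        # response to disk in 100,000-byte chunks (the technique credited to
        # Automate the Boring Stuff, chapter 12, in the later revision); a
        # 'with' block would be an equivalent way to guarantee the file is
        # closed even if a write fails, e.g.:
        #
        #   with open(os.path.join(my_folder, the_page + file_ext), 'wb') as f:
        #       for chunk in res.iter_content(100000):
        #           f.write(chunk)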

    # loop to download main image for each page in list
    counter = 1
    for the_page in my_list:
        # get JSON data and extract image URL
        the_url = get_image_url(the_page)
        # if the URL is not None ...
        if (the_url):
            # tell us where we are for the heck of it
            print("Downloading image " + str(counter))
            # download that image
            download_image(the_url, the_page)
        else:
            print("No image file for " + the_page)
        counter += 1

    print("All done!")