Skip to content

Instantly share code, notes, and snippets.

@mstevenson
Last active November 25, 2022 00:47
Show Gist options
  • Select an option

  • Save mstevenson/8794ae4e7b23d7f5181e69bb3c9b6756 to your computer and use it in GitHub Desktop.

Select an option

Save mstevenson/8794ae4e7b23d7f5181e69bb3c9b6756 to your computer and use it in GitHub Desktop.

Revisions

  1. mstevenson revised this gist Nov 25, 2022. 1 changed file with 16 additions and 10 deletions.
    26 changes: 16 additions & 10 deletions ia_image_downloader.py
    Original file line number Diff line number Diff line change
    @@ -17,26 +17,32 @@ def download_item(item_id, dir):
    item = ia.get_item(item_id, config=config)
    meta = item.metadata
    title = meta['title']
    if isinstance(title, list): # not sure why a list is sometimes returned, so just move on
    return
    description = meta.get('description', None)
    formats = ['JPEG']
    for file in item.files:
    if file['format'] in formats:
    filename = file['name']
    item.download(files=filename, \
    formats=file['format'], \
    destdir=dir,
    no_directory=True, \
    verbose=True, \
    ignore_existing=True, \
    retries=2)
    caption = dir / Path(filename).with_suffix(".txt")
    with open(caption, 'w') as f:
    try:
    item.download(files=filename, \
    formats=file['format'], \
    destdir=dir,
    no_directory=True, \
    verbose=True, \
    ignore_existing=True, \
    retries=2)
    except Exception as e:
    print(f'Download failed: {e}')
    return
    caption = dir / Path(filename).with_suffix('.txt')
    with open(caption, 'w', encoding='utf-8') as f:
    if description and not isinstance(description, list):
    f.write(description)
    else:
    f.write(title)
    return
    print(f"no image for {item_id}")
    print(f'no image for {item_id}')

    if __name__ == '__main__':
    parser = argparse.ArgumentParser()
  2. mstevenson revised this gist Nov 24, 2022. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions ia_image_downloader.py
    Original file line number Diff line number Diff line change
    @@ -11,7 +11,7 @@ def download_collection(collection_name, output_dir):
    dir.mkdir(exist_ok=True)
    for result in search:
    download_item(result['identifier'], dir)
    time.sleep(0.2)
    time.sleep(0.1)

    def download_item(item_id, dir):
    item = ia.get_item(item_id, config=config)
    @@ -31,7 +31,7 @@ def download_item(item_id, dir):
    retries=2)
    caption = dir / Path(filename).with_suffix(".txt")
    with open(caption, 'w') as f:
    if description:
    if description and not isinstance(description, list):
    f.write(description)
    else:
    f.write(title)
  3. mstevenson revised this gist Nov 24, 2022. 1 changed file with 2 additions and 3 deletions.
    5 changes: 2 additions & 3 deletions ia_image_downloader.py
    Original file line number Diff line number Diff line change
    @@ -16,7 +16,7 @@ def download_collection(collection_name, output_dir):
    def download_item(item_id, dir):
    item = ia.get_item(item_id, config=config)
    meta = item.metadata
    # try to get description
    title = meta['title']
    description = meta.get('description', None)
    formats = ['JPEG']
    for file in item.files:
    @@ -34,8 +34,7 @@ def download_item(item_id, dir):
    if description:
    f.write(description)
    else:
    print(f"no description for {filename}")
    f.write('')
    f.write(title)
    return
    print(f"no image for {item_id}")

  4. mstevenson revised this gist Nov 24, 2022. 1 changed file with 4 additions and 5 deletions.
    9 changes: 4 additions & 5 deletions ia_image_downloader.py
    Original file line number Diff line number Diff line change
    @@ -18,19 +18,17 @@ def download_item(item_id, dir):
    meta = item.metadata
    # try to get description
    description = meta.get('description', None)
    formats = ['JPEG', 'PNG', 'TIFF']
    formats = ['JPEG']
    for file in item.files:
    if file['format'] in formats:
    filename = file['name']
    ia.download(item_id, \
    files=filename, \
    item.download(files=filename, \
    formats=file['format'], \
    destdir=dir,
    no_directory=True, \
    verbose=True, \
    ignore_existing=True, \
    retries=2, \
    config=config)
    retries=2)
    caption = dir / Path(filename).with_suffix(".txt")
    with open(caption, 'w') as f:
    if description:
    @@ -39,6 +37,7 @@ def download_item(item_id, dir):
    print(f"no description for {filename}")
    f.write('')
    return
    print(f"no image for {item_id}")

    if __name__ == '__main__':
    parser = argparse.ArgumentParser()
  5. mstevenson revised this gist Nov 24, 2022. 1 changed file with 10 additions and 4 deletions.
    14 changes: 10 additions & 4 deletions ia_image_downloader.py
    Original file line number Diff line number Diff line change
    @@ -16,22 +16,28 @@ def download_collection(collection_name, output_dir):
    def download_item(item_id, dir):
    item = ia.get_item(item_id, config=config)
    meta = item.metadata
    description = meta['description']
    # try to get description
    description = meta.get('description', None)
    formats = ['JPEG', 'PNG', 'TIFF']
    for file in item.files:
    if file['format'] in formats:
    filename = file['name']
    ia.download(item_id, \
    files=file['name'], \
    files=filename, \
    formats=file['format'], \
    destdir=dir,
    no_directory=True, \
    verbose=True, \
    ignore_existing=True, \
    retries=2, \
    config=config)
    caption = dir / Path(file["name"]).with_suffix(".txt")
    caption = dir / Path(filename).with_suffix(".txt")
    with open(caption, 'w') as f:
    f.write(description)
    if description:
    f.write(description)
    else:
    print(f"no description for {filename}")
    f.write('')
    return

    if __name__ == '__main__':
  6. mstevenson revised this gist Nov 24, 2022. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion ia_image_downloader.py
    Original file line number Diff line number Diff line change
    @@ -29,7 +29,7 @@ def download_item(item_id, dir):
    ignore_existing=True, \
    retries=2, \
    config=config)
    caption = dir / f'{item_id}.txt'
    caption = dir / Path(file["name"]).with_suffix(".txt")
    with open(caption, 'w') as f:
    f.write(description)
    return
  7. mstevenson created this gist Nov 24, 2022.
    45 changes: 45 additions & 0 deletions ia_image_downloader.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,45 @@
    import internetarchive as ia
    from pathlib import Path
    import argparse
    import time

    # Shared settings passed as `config=` to every internetarchive call in this file.
    # NOTE(review): `secure=False` presumably makes the library use plain HTTP
    # instead of HTTPS — confirm against the internetarchive configuration docs.
    config = dict(general=dict(secure=False))

    def download_collection(collection_name, output_dir):
        """Download every item found for an Internet Archive collection.

        Runs the search ``collection:<collection_name>``, creates the folder
        ``<output_dir>/<collection_name>`` and calls ``download_item`` once per
        search result, sleeping briefly between items.

        Args:
            collection_name: Collection identifier used in the search query and
                as the name of the destination sub-folder.
            output_dir: Directory under which the collection folder is created.
                It is created too if it does not exist yet.
        """
        search = ia.search_items(f'collection:{collection_name}', config=config)
        target_dir = Path(output_dir) / collection_name
        # parents=True: without it a not-yet-existing output_dir makes mkdir
        # raise FileNotFoundError before anything is downloaded.
        target_dir.mkdir(parents=True, exist_ok=True)
        for result in search:
            download_item(result['identifier'], target_dir)
            # Short pause between items so requests are not fired back to back.
            time.sleep(0.2)

    def download_item(item_id, dir):
        """Download the first image file of an item and write a caption file.

        Looks through the item's files for the first one whose format is JPEG,
        PNG or TIFF, downloads it into ``dir`` and writes the item's description
        to ``<dir>/<item_id>.txt``. Returns after the first matching file; does
        nothing if the item has no matching file.

        Args:
            item_id: Identifier of the item to fetch.
            dir: Destination directory (a path-like object or string).
        """
        item = ia.get_item(item_id, config=config)
        meta = item.metadata
        # The description is optional: indexing meta['description'] directly
        # raised KeyError for items that have none.
        description = meta.get('description')
        if isinstance(description, (list, tuple)):
            # Some items carry several description values; f.write() needs a
            # single string, so join them one per line.
            description = '\n'.join(str(part) for part in description)
        formats = ('JPEG', 'PNG', 'TIFF')
        for file in item.files:
            if file.get('format') not in formats:
                continue
            ia.download(
                item_id,
                files=file['name'],
                formats=file['format'],
                destdir=dir,
                no_directory=True,
                verbose=True,
                ignore_existing=True,
                retries=2,
                config=config,
            )
            caption = Path(dir) / f'{item_id}.txt'
            # Explicit UTF-8 so non-ASCII descriptions do not depend on the
            # platform's default encoding.
            with open(caption, 'w', encoding='utf-8') as f:
                # An absent description yields an empty caption file.
                f.write(description or '')
            # Only the first matching image per item is downloaded.
            return

    if __name__ == '__main__':
        # Command-line entry point: two required positional arguments.
        cli = argparse.ArgumentParser()
        positional_args = (
            ('collection', 'Collection name'),
            ('output_dir', 'Output directory'),
        )
        for arg_name, arg_help in positional_args:
            cli.add_argument(arg_name, help=arg_help)
        options = cli.parse_args()
        collection, output_dir = options.collection, options.output_dir

        # Announce the job, then hand over to the collection downloader.
        print(f'Downloading collection {collection} to {output_dir}')

        download_collection(collection, output_dir)