Last active
November 25, 2022 00:47
-
-
Save mstevenson/8794ae4e7b23d7f5181e69bb3c9b6756 to your computer and use it in GitHub Desktop.
Revisions
-
mstevenson revised this gist
Nov 25, 2022. 1 changed file with 16 additions and 10 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -17,26 +17,32 @@ def download_item(item_id, dir): item = ia.get_item(item_id, config=config) meta = item.metadata title = meta['title'] if isinstance(title, list): # not sure why a list is sometimes returned, so just move on return description = meta.get('description', None) formats = ['JPEG'] for file in item.files: if file['format'] in formats: filename = file['name'] try: item.download(files=filename, \ formats=file['format'], \ destdir=dir, no_directory=True, \ verbose=True, \ ignore_existing=True, \ retries=2) except Exception as e: print(f'Download failed: {e}') return caption = dir / Path(filename).with_suffix('.txt') with open(caption, 'w', encoding='utf-8') as f: if description and not isinstance(description, list): f.write(description) else: f.write(title) return print(f'no image for {item_id}') if __name__ == '__main__': parser = argparse.ArgumentParser() -
mstevenson revised this gist
Nov 24, 2022. 1 changed file with 2 additions and 2 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -11,7 +11,7 @@ def download_collection(collection_name, output_dir): dir.mkdir(exist_ok=True) for result in search: download_item(result['identifier'], dir) time.sleep(0.1) def download_item(item_id, dir): item = ia.get_item(item_id, config=config) @@ -31,7 +31,7 @@ def download_item(item_id, dir): retries=2) caption = dir / Path(filename).with_suffix(".txt") with open(caption, 'w') as f: if description and not isinstance(description, list): f.write(description) else: f.write(title) -
mstevenson revised this gist
Nov 24, 2022. 1 changed file with 2 additions and 3 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -16,7 +16,7 @@ def download_collection(collection_name, output_dir): def download_item(item_id, dir): item = ia.get_item(item_id, config=config) meta = item.metadata title = meta['title'] description = meta.get('description', None) formats = ['JPEG'] for file in item.files: @@ -34,8 +34,7 @@ def download_item(item_id, dir): if description: f.write(description) else: f.write(title) return print(f"no image for {item_id}") -
mstevenson revised this gist
Nov 24, 2022. 1 changed file with 4 additions and 5 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -18,19 +18,17 @@ def download_item(item_id, dir): meta = item.metadata # try to get description description = meta.get('description', None) formats = ['JPEG'] for file in item.files: if file['format'] in formats: filename = file['name'] item.download(files=filename, \ formats=file['format'], \ destdir=dir, no_directory=True, \ verbose=True, \ ignore_existing=True, \ retries=2) caption = dir / Path(filename).with_suffix(".txt") with open(caption, 'w') as f: if description: @@ -39,6 +37,7 @@ def download_item(item_id, dir): print(f"no description for {filename}") f.write('') return print(f"no image for {item_id}") if __name__ == '__main__': parser = argparse.ArgumentParser() -
mstevenson revised this gist
Nov 24, 2022. 1 changed file with 10 additions and 4 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -16,22 +16,28 @@ def download_collection(collection_name, output_dir): def download_item(item_id, dir): item = ia.get_item(item_id, config=config) meta = item.metadata # try to get description description = meta.get('description', None) formats = ['JPEG', 'PNG', 'TIFF'] for file in item.files: if file['format'] in formats: filename = file['name'] ia.download(item_id, \ files=filename, \ formats=file['format'], \ destdir=dir, no_directory=True, \ verbose=True, \ ignore_existing=True, \ retries=2, \ config=config) caption = dir / Path(filename).with_suffix(".txt") with open(caption, 'w') as f: if description: f.write(description) else: print(f"no description for {filename}") f.write('') return if __name__ == '__main__': -
mstevenson revised this gist
Nov 24, 2022. 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -29,7 +29,7 @@ def download_item(item_id, dir): ignore_existing=True, \ retries=2, \ config=config) caption = dir / Path(file["name"]).with_suffix(".txt") with open(caption, 'w') as f: f.write(description) return -
mstevenson created this gist
Nov 24, 2022. There are no files selected for viewing.
# NOTE: the hosting page flagged this file as containing hidden or
# bidirectional Unicode text that may be interpreted or compiled differently
# than it appears; review it in an editor that reveals such characters.
# Diff hunk: @@ -0,0 +1,45 @@ (initial revision of the script).
"""Download one image per item of an Internet Archive collection.

For every item returned by a collection search, the first file whose format
is in ``IMAGE_FORMATS`` is downloaded into ``<output_dir>/<collection>`` and a
``<item_id>.txt`` caption file holding the item's description is written
next to it.
"""
import argparse
import time
from pathlib import Path

import internetarchive as ia

# Session configuration handed to every internetarchive call.
config = dict(general=dict(secure=False))

# File formats (as reported in each file's 'format' field) treated as images.
IMAGE_FORMATS = ('JPEG', 'PNG', 'TIFF')


def _as_text(value):
    """Return *value* as one string; list values are joined with newlines.

    Item metadata fields are usually strings, but a field is sometimes
    returned as a list, and a missing field comes back as ``None``.
    """
    if value is None:
        return ''
    if isinstance(value, (list, tuple)):
        return '\n'.join(str(part) for part in value)
    return str(value)


def download_collection(collection_name, output_dir):
    """Download every item found by searching ``collection:<collection_name>``.

    Args:
        collection_name: Collection identifier used in the search query and
            as the name of the sub-directory created under *output_dir*.
        output_dir: Directory under which the collection folder is created.
    """
    search = ia.search_items(f'collection:{collection_name}', config=config)
    dir = Path(output_dir) / collection_name
    # parents=True so a not-yet-existing output_dir does not abort the run.
    dir.mkdir(parents=True, exist_ok=True)
    for result in search:
        download_item(result['identifier'], dir)
        # Short pause between items to avoid hammering the service.
        time.sleep(0.2)


def download_item(item_id, dir):
    """Download the first image file of *item_id* and write its caption.

    The caption is the item's description; when the description is missing
    or empty the title is used instead. Only the first file whose format is
    in ``IMAGE_FORMATS`` is fetched; items without such a file are skipped.

    Args:
        item_id: Identifier of the item to fetch.
        dir: ``pathlib.Path`` of the directory receiving the image and the
            ``<item_id>.txt`` caption file.
    """
    item = ia.get_item(item_id, config=config)
    meta = item.metadata
    # .get() instead of indexing: not every item carries a description.
    caption_text = (_as_text(meta.get('description'))
                    or _as_text(meta.get('title')))
    for file in item.files:
        if file['format'] not in IMAGE_FORMATS:
            continue
        try:
            ia.download(item_id,
                        files=file['name'],
                        formats=file['format'],
                        destdir=dir,
                        no_directory=True,
                        verbose=True,
                        ignore_existing=True,
                        retries=2,
                        config=config)
        except Exception as e:
            # Boundary handler: one failed item must not abort the whole
            # collection run, so report it and move on.
            print(f'Download failed: {e}')
            return
        caption = dir / f'{item_id}.txt'
        # Explicit UTF-8 so non-ASCII descriptions are written identically
        # regardless of the platform's default encoding.
        with open(caption, 'w', encoding='utf-8') as f:
            f.write(caption_text)
        return


def main():
    """Parse the command line and download the requested collection."""
    parser = argparse.ArgumentParser()
    parser.add_argument('collection', help='Collection name')
    parser.add_argument('output_dir', help='Output directory')
    args = parser.parse_args()
    print(f'Downloading collection {args.collection} to {args.output_dir}')
    download_collection(args.collection, args.output_dir)


if __name__ == '__main__':
    main()