# Python

A script that downloads multiple files in parallel, with support for the `s3://`, `http://`, and `https://` protocols.


## Description

### `multi-downloader.py`

Usage: `python multi-downloader.py url1 url2 url3`
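For example, to fetch one S3 object and one HTTPS file in parallel (the bucket and URLs below are placeholders):

```
python multi-downloader.py s3://my-bucket/data/archive.zip https://example.com/files/report.pdf
```

Note that `s3://` URLs require AWS credentials that `boto3` can discover, for example via `~/.aws/credentials` or environment variables.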

---


## Installation

* **Mac OS X**: A version of Python is already installed.
* **Windows**: You will need to install one of the 2.x versions available at [python.org](http://www.python.org/getit/).

## Dependencies

The script needs a few additional Python packages before it can run from the command line:

* boto3
* requests
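Both can be installed in one step, assuming `pip` is available on the command line:

```
pip install boto3 requests
```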
## `multi-downloader.py`
```python
#!/usr/bin/env python
import re
import sys
from multiprocessing import Process

import boto3
import botocore
import requests

# Collect the URL arguments, skipping the first element since it is the script name
urls = sys.argv[1:]

# Map each URL scheme (s3, http, https) to the list of URLs that use it,
# so every URL can be dispatched to the matching download function
filtered_urls = {}


def is_downloadable(url):
    """Return True if the URL points to a downloadable resource."""
    h = requests.head(url)
    content_type = h.headers.get('content-type', '')
    if 'text' in content_type.lower():
        return False
    if 'html' in content_type.lower():
        return False
    return True


def s3dl(path):
    try:
        # Break the s3:// URI down into bucket, file path and file name
        uri = re.match(r's3://(.+?)/(.+)', path)
        bucket = uri.group(1)
        file_path = uri.group(2)
        if '/' in file_path:
            file_name = file_path.rsplit('/', 1)[1]
        else:
            file_name = file_path
        s3 = boto3.resource('s3')
        s3.Bucket(bucket).download_file(file_path, file_name)
        return True
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise


def httpdl(path):
    try:
        # Check whether the resource is downloadable
        if is_downloadable(path):
            # Derive the local file name from the last URL segment
            filename = path.rsplit('/', 1)[-1]
            r = requests.get(path, stream=True)
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        else:
            print('File is not downloadable.')
            return False
    except Exception:
        print("Error downloading file.")
        return False


if __name__ == "__main__":
    # Iterate through the URLs and group them by scheme
    for url in urls:
        scheme, remainder = url.split('://', 1)
        if scheme not in filtered_urls:
            filtered_urls[scheme] = []
        filtered_urls[scheme].append(remainder)

    # Hold one process per download so they can run simultaneously
    processes = []

    # Rebuild each full URL and create a download process using the
    # function that matches its scheme
    for download_type in filtered_urls:
        for path in filtered_urls[download_type]:
            download = download_type + '://' + path
            if download_type == "s3":
                process = Process(target=s3dl, args=(download,))
                processes.append(process)
            if download_type == "http" or download_type == "https":
                process = Process(target=httpdl, args=(download,))
                processes.append(process)

    # Start the processes
    for process in processes:
        process.start()

    # Wait for every process to finish, reporting each one as it completes
    for process in processes:
        process.join()
        print(process)

    print("All downloads have been completed")
```