# Python

A script that downloads multiple files in parallel, with support for the `s3://`, `http://`, and `https://` protocols.


## Description

### `multi-downloader.py`

Usage: `python multi-downloader.py url1 url2 url3`
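For example, to fetch one S3 object and one HTTPS file in parallel (the bucket and URLs below are placeholders):

```
python multi-downloader.py s3://my-bucket/data/archive.zip https://example.com/files/report.pdf
```

Note that `s3://` URLs require AWS credentials that `boto3` can discover, for example via `~/.aws/credentials` or environment variables.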

---


## Installation

* **Mac OS X**: A version of Python is already installed.
* **Windows**: You will need to install one of the 2.x versions available at [python.org](http://www.python.org/getit/).

## Dependencies

The script needs a few additional Python packages before it can run from the command line:

* boto3
* requests
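Both can be installed in one step, assuming `pip` is available on the command line:

```
pip install boto3 requests
```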
## `multi-downloader.py`
```python
#!/usr/bin/env python
import re
import sys
from multiprocessing import Process

import boto3
import botocore
import requests

# Collect the URL arguments, skipping the first element since it is the script name
urls = sys.argv[1:]

# Map each URL scheme (s3, http, https) to the list of URLs that use it,
# so every URL can be dispatched to the matching download function
filtered_urls = {}


def is_downloadable(url):
    """Return True if the URL points to a downloadable resource."""
    h = requests.head(url)
    content_type = h.headers.get('content-type', '')
    if 'text' in content_type.lower():
        return False
    if 'html' in content_type.lower():
        return False
    return True


def s3dl(path):
    try:
        # Break the s3:// URI down into bucket, file path and file name
        uri = re.match(r's3://(.+?)/(.+)', path)
        bucket = uri.group(1)
        file_path = uri.group(2)
        if '/' in file_path:
            file_name = file_path.rsplit('/', 1)[1]
        else:
            file_name = file_path
        s3 = boto3.resource('s3')
        s3.Bucket(bucket).download_file(file_path, file_name)
        return True
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise


def httpdl(path):
    try:
        # Check whether the resource is downloadable
        if is_downloadable(path):
            # Derive the local file name from the last URL segment
            filename = path.rsplit('/', 1)[-1]
            r = requests.get(path, stream=True)
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        else:
            print('File is not downloadable.')
            return False
    except Exception:
        print("Error downloading file.")
        return False


if __name__ == "__main__":
    # Iterate through the URLs and group them by scheme
    for url in urls:
        scheme, remainder = url.split('://', 1)
        if scheme not in filtered_urls:
            filtered_urls[scheme] = []
        filtered_urls[scheme].append(remainder)

    # Hold one process per download so they can run simultaneously
    processes = []

    # Rebuild each full URL and create a download process using the
    # function that matches its scheme
    for download_type in filtered_urls:
        for path in filtered_urls[download_type]:
            download = download_type + '://' + path
            if download_type == "s3":
                process = Process(target=s3dl, args=(download,))
                processes.append(process)
            if download_type == "http" or download_type == "https":
                process = Process(target=httpdl, args=(download,))
                processes.append(process)

    # Start the processes
    for process in processes:
        process.start()

    # Wait for every process to finish, reporting each one as it completes
    for process in processes:
        process.join()
        print(process)

    print("All downloads have been completed")
```