Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save looio/a50c7ca12bf804683261f11f28fe87b4 to your computer and use it in GitHub Desktop.
Save looio/a50c7ca12bf804683261f11f28fe87b4 to your computer and use it in GitHub Desktop.
Image downloader. Downloads the images linked from or embedded in a web page.
# coding:utf-8
import math
import os
import re
import sys
import time
from http import cookiejar
from http.client import HTTPException
from urllib import request, error
from urllib.parse import urljoin, urlsplit
class Downloader():
    """Download every image linked from, or embedded in, a single web page.

    The page at ``url`` is fetched, scanned for ``<a href=...>`` and
    ``<img src=...>`` tags whose target has an image file extension, and each
    target is saved into a fresh ``<dir_name>_<unix timestamp>`` directory.

    Args:
        url: Address of the page to scan.
        progress: Optional reporter providing ``set_origin(total)`` and
            ``show(count)``; it is notified after every download attempt.
    """

    # Prefix of the output directory; a '_<unix timestamp>' suffix is appended.
    dir_name = './DL_images'
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8) '
                  'AppleWebKit/536.25 (KHTML, like Gecko) Version/6.0 Safari/536.25')
    # Socket timeout, in seconds, applied to every request.
    timeout = 30
    # The tag patterns accept other attributes before href/src. The look-behind
    # keeps attributes such as data-href / data-src from being mistaken for them.
    a_tag_pattern = re.compile(
        r'<\s*a\s[^>]*?(?<![\w-])href\s*=[^>]*>', re.IGNORECASE)
    # Group 1 holds a double-quoted value, group 2 a single-quoted one.
    a_link_pattern = re.compile(
        r"""(?<![\w-])href\s*=\s*(?:"([^"]*)"|'([^']*)')""", re.IGNORECASE)
    img_tag_pattern = re.compile(
        r'<\s*img\s[^>]*?(?<![\w-])src\s*=[^>]*>', re.IGNORECASE)
    img_link_pattern = re.compile(
        r"""(?<![\w-])src\s*=\s*(?:"([^"]*)"|'([^']*)')""", re.IGNORECASE)
    # Lower-case file extensions (without the dot) treated as images.
    img_format = ['jpg', 'jpeg', 'png', 'gif', 'bmp']
    # Failures that abort one request but must not abort the whole run:
    # network/file errors, malformed URLs and broken HTTP responses.
    _request_errors = (error.URLError, OSError, ValueError, HTTPException)

    def __init__(self, url, progress=None):
        self.url = url
        self.progress = progress
        # Base used to resolve relative links; fetch_html() replaces it with
        # the final address when the request was redirected.
        self._base_url = url

    def run_download(self):
        """Fetch the page, collect its image URLs and download them all."""
        urls = self.__parse_html(self.fetch_html())
        self.__export_file(urls)

    def download(self, url, file_name):
        """Save the resource at ``url`` to the local path ``file_name``.

        Failures are reported on standard error instead of being raised, so a
        single broken link does not stop the remaining downloads.

        Returns:
            True when the file was written, False otherwise.
        """
        try:
            # Request() itself raises ValueError for URLs without a scheme,
            # so it has to live inside the try block as well.
            req = request.Request(url)
            req.add_header('User-agent', self.user_agent)
            with request.build_opener().open(req, timeout=self.timeout) as conn:
                data = conn.read()
            # The file is only created once the payload has been received, so
            # a failed request leaves no empty file behind.
            with open(file_name, 'wb') as img_file:
                img_file.write(data)
        except self._request_errors as e:
            sys.stderr.write('\nfailed to download {0}: {1}\n'.format(url, e))
            return False
        return True

    def fetch_html(self):
        """Return the text of the page at ``self.url``.

        The body is decoded with the charset announced by the server, falling
        back to UTF-8; undecodable bytes are replaced rather than raising.

        Returns:
            The page text, or '' when the page could not be fetched.
        """
        self._base_url = self.url
        cj = cookiejar.CookieJar()
        opener = request.build_opener(request.HTTPCookieProcessor(cj))
        opener.addheaders = [('User-agent', self.user_agent)]
        try:
            with opener.open(self.url, timeout=self.timeout) as conn:
                raw = conn.read()
                charset = conn.headers.get_content_charset() or 'utf-8'
                # After a redirect, relative links are relative to the final URL.
                self._base_url = conn.geturl() or self.url
        except self._request_errors as e:
            sys.stderr.write('failed to fetch {0}: {1}\n'.format(self.url, e))
            return ''
        try:
            return raw.decode(charset, errors='replace')
        except LookupError:
            # The server announced a charset Python does not know.
            return raw.decode('utf-8', errors='replace')

    @staticmethod
    def _link_from_tag(link_pattern, tag):
        """Return the quoted href/src value found in ``tag``, or None."""
        match = link_pattern.search(tag)
        if not match:
            return None
        value = match.group(1)
        if value is None:
            value = match.group(2)
        return value.strip()

    def __is_image(self, url):
        """Tell whether the path of ``url`` ends in a known image extension."""
        # Only the path is inspected, so '?query' and '#fragment' are ignored.
        extension = os.path.splitext(urlsplit(url).path)[1]
        return extension[1:].lower() in self.img_format

    def __parse_html(self, html):
        """Extract the absolute image URLs referenced by ``html``.

        Anchors are scanned first, then image tags. Relative links are
        resolved against the page address and duplicates are dropped while
        keeping the order of first appearance.
        """
        if not html:
            return []
        sources = (
            (self.a_tag_pattern, self.a_link_pattern),
            (self.img_tag_pattern, self.img_link_pattern),
        )
        urls = []
        for tag_pattern, link_pattern in sources:
            for tag in tag_pattern.findall(html):
                link = self._link_from_tag(link_pattern, tag)
                if not link:
                    continue
                try:
                    absolute = urljoin(self._base_url, link)
                    is_image = self.__is_image(absolute)
                except ValueError:
                    # Malformed link (e.g. a broken IPv6 literal): skip it.
                    continue
                if is_image:
                    urls.append(absolute)
        return list(dict.fromkeys(urls))

    def __get_filename(self, path):
        """Return the last path segment of ``path`` without query/fragment."""
        if not path:
            return ''
        return urlsplit(path).path.split('/')[-1]

    def __export_file(self, urls):
        """Download every URL in ``urls`` into a new timestamped directory."""
        if not urls:
            return
        dir_name = '{0}_{1}'.format(self.dir_name, int(time.time()))
        # exist_ok: a second run within the same second reuses the directory.
        os.makedirs(dir_name, exist_ok=True)
        if self.progress:
            self.progress.set_origin(len(urls))
        for count, url in enumerate(urls, start=1):
            file_name = os.path.join(dir_name, self.__get_filename(url))
            self.download(url, file_name)
            if self.progress:
                self.progress.show(count)
class Progress():
    """Draw a one-line text progress bar on standard output.

    Call ``set_origin(total)`` once, then ``show(done)`` after each step. The
    bar is redrawn in place and is at most ``max_gauge`` characters wide.
    """

    # Width of the completed bar, in '#' characters.
    max_gauge = 40

    def __init__(self):
        # Total number of steps; 0 until set_origin() is called.
        self.origin = 0

    def set_origin(self, origin):
        """Set the total number of steps that corresponds to a full bar."""
        self.origin = origin

    def show(self, increment):
        """Redraw the bar for ``increment`` completed steps."""
        rate = self.__calc(increment)
        # '\r' moves back to the start of the line so the bar overwrites itself.
        sys.stdout.write('\rprogress: {0}'.format('#' * rate))
        # No newline is written, so flush or the bar may stay in the buffer.
        sys.stdout.flush()

    def __calc(self, increment):
        """Return how many '#' marks represent ``increment`` steps."""
        if not self.origin or self.origin < 0:
            # No total is known yet: draw an empty bar rather than dividing by zero.
            return 0
        # Cap at 1.0 so that extra steps never make the bar exceed max_gauge.
        rate = min(round(increment / self.origin, 2), 1.0)
        return math.ceil(self.max_gauge * rate)
if __name__ == '__main__':
    # Command-line entry point: the first argument is the address of the page
    # whose images should be downloaded.
    param = sys.argv
    if len(param) < 2:
        print('no args')
        # A missing URL is a usage error, so report it with a non-zero status.
        sys.exit(1)
    print('download start')
    dl = Downloader(param[1], Progress())
    dl.run_download()
    # The progress bar ends without a newline; move past it before the final message.
    print('\n')
    print('download finish')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment