# coding:utf-8
"""Download every image linked from a single web page.

Usage: run the script with the page URL as its first argument.  Images
referenced by ``<a href="...">`` or ``<img src="...">`` tags whose path ends
in a known image extension are saved into a new ``DL_images_<timestamp>``
directory under the current working directory.
"""
import logging
import math
import os
import re
import sys
import time
from http import client
from http import cookiejar
from urllib import error, request
from urllib.parse import urljoin, urlsplit

logger = logging.getLogger(__name__)


class Downloader():
    """Fetch one HTML page and download the images it links to.

    Attributes:
        url: Address of the page to scan; also the base for relative links.
        progress: Optional reporter exposing ``set_origin(total)`` and
            ``show(done)``; ``None`` disables progress output.
    """

    # Prefix of the output directory; a "_<unix seconds>/" suffix is appended.
    dir_name = './DL_images'
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8) '
                  'AppleWebKit/536.25 (KHTML, like Gecko) '
                  'Version/6.0 Safari/536.25')
    # Raw strings keep "\s" a regex escape instead of an invalid str escape.
    a_tag_pattern = re.compile(r'<[\s]*a[\s]*href[\s]*=.*?>')
    a_link_pattern = re.compile(r'href[\s]*="(.*?)"')
    img_tag_pattern = re.compile(r'<[\s]*img[\s]*src[\s]*=.*?>')
    img_link_pattern = re.compile(r'src[\s]*="(.*?)"')
    # Lower-case file extensions that are treated as images.
    img_format = ['jpg', 'jpeg', 'png', 'gif', 'bmp']

    def __init__(self, url, progress=None):
        self.url = url
        self.progress = progress

    def run_download(self):
        """Fetch the page, extract its image URLs and download them all."""
        urls = self.__parse_html(self.fetch_html())
        self.__export_file(urls)

    def download(self, url, file_name):
        """Save the resource at *url* to *file_name*.

        Best effort: network, HTTP, file-system and malformed-URL errors are
        logged as warnings and otherwise ignored, so one bad link does not
        abort a batch.  The file is only created once the body has been read
        completely, so a failed transfer leaves no empty file behind.
        """
        try:
            req = request.Request(url)
            req.add_header('User-agent', self.user_agent)
            with request.build_opener().open(req) as conn:
                data = conn.read()
            with open(file_name, 'wb') as img_file:
                img_file.write(data)
        except (OSError, ValueError, client.HTTPException) as e:
            # error.URLError and IOError are both OSError; ValueError covers
            # URLs without a usable scheme.
            logger.warning('failed to download %s: %s', url, e)

    def fetch_html(self):
        """Return the page at ``self.url`` as text, or ``''`` on failure.

        The body is decoded with the charset declared in the response
        headers (UTF-8 when none is declared or the declared one is unknown);
        undecodable bytes are replaced rather than raising.
        """
        cj = cookiejar.CookieJar()
        opener = request.build_opener(request.HTTPCookieProcessor(cj))
        opener.addheaders = [('User-agent', self.user_agent)]
        try:
            with opener.open(self.url) as conn:
                raw = conn.read()
                charset = conn.headers.get_content_charset() or 'utf-8'
        except (OSError, ValueError, client.HTTPException) as e:
            logger.warning('failed to fetch %s: %s', self.url, e)
            return ''
        try:
            return raw.decode(charset, errors='replace')
        except LookupError:
            # The server declared a charset Python does not know.
            return raw.decode('utf-8', errors='replace')

    def __parse_html(self, str):
        """Return the absolute image URLs found in the HTML text *str*.

        Links from ``<a href>`` tags come first, then ``<img src>`` links,
        each in document order.  Relative links are resolved against
        ``self.url`` and duplicates are dropped (first occurrence wins).

        NOTE: the parameter name shadows the builtin ``str``; it is kept for
        interface stability and the builtin is not needed in this method.
        """
        if not str:
            return []

        links = []
        for tag_pattern, link_pattern in (
                (self.a_tag_pattern, self.a_link_pattern),
                (self.img_tag_pattern, self.img_link_pattern)):
            for tag in tag_pattern.findall(str):
                match = link_pattern.search(tag)
                if match:
                    links.append(match.group(1))

        urls = []
        seen = set()
        for link in links:
            url = self.__resolve_image_url(link)
            if url and url not in seen:
                seen.add(url)
                urls.append(url)
        return urls

    def __resolve_image_url(self, link):
        """Return *link* as an absolute URL if it names an image, else None."""
        try:
            url = urljoin(self.url, link)
            name = self.__get_filename(url)
        except ValueError:
            # urljoin rejects some malformed links (e.g. broken IPv6 hosts).
            return None
        ext = name.rsplit('.', 1)[-1] if '.' in name else ''
        return url if ext.lower() in self.img_format else None

    def __get_filename(self, path):
        """Return the last path segment of *path* (a URL or plain path).

        Query strings and fragments are not part of the result; an empty
        *path* yields ``''``.
        """
        if not path:
            return ''
        try:
            url_path = urlsplit(path).path
        except ValueError:
            url_path = path
        return url_path.rsplit('/', 1)[-1]

    @staticmethod
    def __unique_name(name, used_names):
        """Return *name*, suffixed with ``_<n>`` if already in *used_names*.

        The returned name is added to *used_names*.
        """
        stem, ext = os.path.splitext(name)
        candidate = name
        suffix = 1
        while candidate in used_names:
            candidate = '{0}_{1}{2}'.format(stem, suffix, ext)
            suffix += 1
        used_names.add(candidate)
        return candidate

    def __export_file(self, urls):
        """Download every URL in *urls* into a fresh timestamped directory.

        Does nothing for an empty list.  Progress, when configured, is
        reported after each attempted download, successful or not.
        """
        if not urls:
            return

        timestamp = str(time.time()).split('.')[0]
        dir_name = self.dir_name + '_' + timestamp + '/'
        # exist_ok: two runs within the same second share the directory.
        os.makedirs(dir_name, exist_ok=True)

        if self.progress:
            self.progress.set_origin(len(urls))

        used_names = set()
        for count, url in enumerate(urls, 1):
            # Distinct URLs may share a basename; avoid overwriting files.
            name = self.__unique_name(self.__get_filename(url), used_names)
            self.download(url, dir_name + name)
            if self.progress:
                self.progress.show(count)


class Progress():
    """Render a one-line ``#`` progress bar on standard output."""

    # Number of '#' characters shown when the work is complete.
    max_gauge = 40

    def __init__(self):
        # Total number of work items; 0 until set_origin() is called.
        self.origin = 0

    def set_origin(self, origin):
        """Set the total number of work items the bar represents."""
        self.origin = origin

    def show(self, increment):
        """Redraw the bar for *increment* completed items."""
        rate = self.__calc(increment)
        sys.stdout.write('\rprogress: {0}'.format('#' * rate))
        # No newline is written, so flush to make the update visible.
        sys.stdout.flush()

    def __calc(self, increment):
        """Return the bar length for *increment* items, capped at max_gauge.

        Returns 0 while no positive total has been set.
        """
        if not self.origin or self.origin <= 0:
            return 0
        rate = round(increment / self.origin, 2)
        return min(self.max_gauge, math.ceil(self.max_gauge * rate))


def main():
    """Command-line entry point: download the images of the page in argv[1]."""
    param = sys.argv
    if len(param) < 2:
        print('no args')
        # Non-zero status so callers can detect the usage error.
        sys.exit(1)
    print('download start')
    dl = Downloader(param[1], Progress())
    dl.run_download()
    print('\n')
    print('download finish')


if __name__ == '__main__':
    main()