-
-
Save looio/a50c7ca12bf804683261f11f28fe87b4 to your computer and use it in GitHub Desktop.
Image downloader. Downloads the images referenced by a given web page.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # coding:utf-8 | |
| import sys | |
| import os | |
| import re | |
| import time | |
| import math | |
| from urllib import request, error | |
| from http import cookiejar | |
class Downloader():
    """Download the image files referenced by a single web page.

    The page at ``url`` is fetched, its ``<a href="...">`` and
    ``<img src="...">`` tags are scanned, and every link whose text after the
    last ``.`` is listed in ``img_format`` is saved into a directory named
    ``<dir_name>_<unix seconds>/``.

    Args:
        url: Address of the page to scan.
        progress: Optional object providing ``set_origin(total)`` and
            ``show(done)``; it is notified as files are processed.
    """

    # Prefix of the output directory; the current Unix time is appended per run.
    dir_name = './DL_images'
    user_agent = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8) '
        'AppleWebKit/536.25 (KHTML, like Gecko) Version/6.0 Safari/536.25'
    )
    # Raw strings keep ``\s`` a regex class instead of an invalid string escape.
    a_tag_pattern = re.compile(r'<[\s]*a[\s]*href[\s]*=.*?>')
    a_link_pattern = re.compile(r'href[\s]*="(.*?)"')
    img_tag_pattern = re.compile(r'<[\s]*img[\s]*src[\s]*=.*?>')
    img_link_pattern = re.compile(r'src[\s]*="(.*?)"')
    # Lower-case extensions (text after the last '.') treated as images.
    img_format = ['jpg', 'jpeg', 'png', 'gif', 'bmp']

    def __init__(self, url, progress=None):
        self.url = url
        self.progress = progress

    def run_download(self):
        """Fetch the page, collect its image links and download each one."""
        urls = self.__parse_html(self.fetch_html())
        self.__export_file(urls)

    def download(self, url, file_name):
        """Save the resource at ``url`` to ``file_name``.

        Failures are reported on stderr instead of being raised, so that one
        bad link does not abort a batch.

        Returns:
            True when the file was written, False when the download failed.
        """
        try:
            # Request() raises ValueError for URLs without a scheme.
            req = request.Request(url)
            req.add_header('User-agent', self.user_agent)
            with request.build_opener().open(req) as conn:
                data = conn.read()
            # Open the target only after the body was read, so a failed
            # transfer does not leave an empty file behind.
            with open(file_name, "wb") as img_file:
                img_file.write(data)
        except (error.URLError, IOError, ValueError) as e:
            sys.stderr.write('\ndownload failed: {0} ({1})\n'.format(url, e))
            return False
        return True

    def fetch_html(self):
        """Return the text of the page at ``self.url``.

        Returns:
            The decoded page, or ``''`` when opening the URL raised
            ``URLError`` (the failure is reported on stderr).
        """
        opener = request.build_opener(
            request.HTTPCookieProcessor(cookiejar.CookieJar()))
        opener.addheaders = [('User-agent', self.user_agent)]
        try:
            conn = opener.open(self.url)
        except error.URLError as e:
            sys.stderr.write('failed to fetch {0}: {1}\n'.format(self.url, e))
            return ''
        with conn:
            raw = conn.read()
            # Prefer the charset declared in the response headers.
            charset = conn.headers.get_content_charset() or 'utf-8'
        try:
            # errors='replace': undecodable bytes must not abort the scan.
            return raw.decode(charset, errors='replace')
        except LookupError:
            # The declared charset is unknown to Python; fall back to UTF-8.
            return raw.decode('utf-8', errors='replace')

    def __parse_html(self, str):
        """Return the image URLs found in the HTML text ``str``.

        Links from ``<a>`` tags come first, then links from ``<img>`` tags.
        Each link is resolved against ``self.url`` and duplicates are dropped
        while keeping first-seen order.

        The parameter name shadows the builtin ``str``; it is kept so the
        method signature stays unchanged.
        """
        from urllib.parse import urljoin

        if not str:
            return []
        links = self.__collect_links(
            self.a_tag_pattern, self.a_link_pattern, str)
        links += self.__collect_links(
            self.img_tag_pattern, self.img_link_pattern, str)
        # Relative links cannot be opened as-is, so make them absolute.
        resolved = (urljoin(self.url, link) for link in links)
        # dict.fromkeys removes duplicates and preserves insertion order.
        return list(dict.fromkeys(resolved))

    def __collect_links(self, tag_pattern, link_pattern, html):
        """Return the image links of all tags in ``html`` matching ``tag_pattern``."""
        links = []
        for tag in tag_pattern.findall(html):
            match = link_pattern.search(tag)
            if match is None:
                continue
            link = match.group(1)
            if link.split('.')[-1].lower() in self.img_format:
                links.append(link)
        return links

    def __get_filename(self, path):
        """Return the part of ``path`` after its last ``/`` ('' for empty input)."""
        if not path:
            return ''
        return path.rsplit('/', 1)[-1]

    def __export_file(self, urls):
        """Download every URL in ``urls`` into a new per-run directory."""
        if not urls:
            return
        dir_name = '{0}_{1}/'.format(self.dir_name, int(time.time()))
        # exist_ok: a second run within the same second reuses the directory
        # instead of raising FileExistsError.
        os.makedirs(dir_name, exist_ok=True)
        if self.progress:
            self.progress.set_origin(len(urls))
        for count, url in enumerate(urls, 1):
            self.download(url, dir_name + self.__get_filename(url))
            if self.progress:
                self.progress.show(count)
class Progress():
    """Text progress gauge drawn on standard output.

    Call ``set_origin(total)`` once, then ``show(done)`` after each step; the
    gauge is redrawn in place as a run of ``#`` characters.
    """

    # Number of '#' characters in a completely filled gauge.
    max_gauge = 40

    def __init__(self):
        # Total number of steps; 0 until set_origin() is called.
        self.origin = 0

    def set_origin(self, origin):
        """Set the total number of steps that corresponds to a full gauge."""
        self.origin = origin

    def show(self, increment):
        """Redraw the gauge for ``increment`` completed steps."""
        rate = self.__calc(increment)
        # '\r' returns to the start of the line so the gauge overwrites itself.
        sys.stdout.write('\rprogress: {0}'.format('#' * rate))
        # No newline is written, so flush explicitly or a buffered stdout
        # may hold the gauge back until the program exits.
        sys.stdout.flush()

    def __calc(self, increment):
        """Return how many gauge characters represent ``increment`` steps.

        Returns 0 when no positive total has been set, and never more than
        ``max_gauge``.
        """
        if not self.origin or self.origin < 0:
            return 0
        rate = round(increment / self.origin, 2)
        filled = math.ceil(self.max_gauge * rate)
        # Clamp so an out-of-range increment cannot overflow the gauge.
        return max(0, min(self.max_gauge, filled))
if __name__ == '__main__':
    # Command-line entry point: the first argument is the URL of the page
    # whose images should be downloaded.
    param = sys.argv
    if len(param) < 2:
        print('no args')
        # Exit with a non-zero status so callers can detect the usage error.
        sys.exit(1)
    print('download start')
    dl = Downloader(param[1], Progress())
    dl.run_download()
    # The progress gauge ends without a newline; move past it before the
    # final message.
    print('\n')
    print('download finish')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment