looio · June 7, 2016 11:29
diff --git a/simple_image_downloader.py b/simple_image_downloader.py
 # coding:utf-8


 import sys
 import os
 import re
 import time
 import math
 from urllib import request, error
 from http import cookiejar


 class Downloader():
    """Download every image referenced by a single HTML page.

    The page at ``url`` is fetched and scanned for ``<a href="...">`` and
    ``<img src="...">`` tags whose target ends in one of ``img_format``.
    Each match is saved into a ``<dir_name>_<unix-seconds>/`` directory.

    Limitation: the tag patterns only recognise tags in which ``href`` /
    ``src`` directly follows the tag name and the value is double-quoted.
    """

    dir_name = './DL_images'
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8) '
                  'AppleWebKit/536.25 (KHTML, like Gecko) Version/6.0 Safari/536.25')
    # Raw strings keep "\s" a regex escape instead of an invalid string escape.
    a_tag_pattern = re.compile(r'<[\s]*a[\s]*href[\s]*=.*?>')
    a_link_pattern = re.compile(r'href[\s]*="(.*?)"')
    img_tag_pattern = re.compile(r'<[\s]*img[\s]*src[\s]*=.*?>')
    img_link_pattern = re.compile(r'src[\s]*="(.*?)"')
    img_format = ['jpg', 'jpeg', 'png', 'gif', 'bmp']

    def __init__(self, url, progress=None):
        """Remember the page ``url`` and an optional progress reporter.

        ``progress`` may be any object providing ``set_origin(total)`` and
        ``show(count)``; pass ``None`` to disable progress output.
        """
        self.url = url
        self.progress = progress

    def run_download(self):
        """Fetch the page, extract the image URLs and download them all."""
        urls = self.__parse_html(self.fetch_html())
        self.__export_file(urls)

    def download(self, url, file_name):
        """Save the resource at ``url`` to ``file_name``.

        Downloading is best-effort: failures are not raised.

        Returns:
            True when the file was written, False when the URL was invalid,
            could not be fetched, or the file could not be written.
        """
        try:
            # Request() raises ValueError for URLs without a usable scheme,
            # so it has to live inside the try block as well.
            req = request.Request(url)
            req.add_header('User-agent', self.user_agent)
            with request.build_opener().open(req) as conn:
                data = conn.read()
            # Read first, write second: a failed read leaves no empty file.
            with open(file_name, 'wb') as img_file:
                img_file.write(data)
        except (error.URLError, IOError, ValueError):
            return False
        return True

    def fetch_html(self):
        """Return the text of the page at ``self.url``.

        The body is decoded with the charset announced by the response
        headers (UTF-8 when none is given); undecodable bytes are replaced
        rather than raising.

        Returns:
            The page text, or an empty string when the page cannot be
            opened or read.
        """
        cj = cookiejar.CookieJar()
        opener = request.build_opener(request.HTTPCookieProcessor(cj))
        opener.addheaders = [('User-agent', self.user_agent)]
        try:
            with opener.open(self.url) as conn:
                raw = conn.read()
                charset = conn.headers.get_content_charset() or 'utf-8'
        except (error.URLError, IOError, ValueError):
            return ''
        try:
            return raw.decode(charset, errors='replace')
        except LookupError:
            # The server announced a charset Python does not know.
            return raw.decode('utf-8', errors='replace')

    def __collect_links(self, html, tag_pattern, link_pattern):
        """Return absolute image URLs found in tags matching ``tag_pattern``."""
        from urllib.parse import urljoin

        links = []
        for tag in tag_pattern.findall(html):
            match = link_pattern.search(tag)
            if not match:
                continue
            link = match.group(1)
            if link.split('.')[-1].lower() not in self.img_format:
                continue
            try:
                # Resolve relative links against the page URL so that they
                # can actually be requested later.
                link = urljoin(self.url, link)
            except ValueError:
                pass  # keep the raw link; download() will report failure
            links.append(link)
        return links

    def __parse_html(self, str):
        """Extract image URLs from the HTML text ``str``.

        Links from ``<a>`` tags come first, followed by ``<img>`` sources,
        each group in document order.  Returns an empty list for empty input.
        """
        if not str:
            return []
        html = str
        anchors = self.__collect_links(
            html, self.a_tag_pattern, self.a_link_pattern)
        images = self.__collect_links(
            html, self.img_tag_pattern, self.img_link_pattern)
        return anchors + images

    def __get_filename(self, path):
        """Return the part of ``path`` after the last ``/`` ('' if empty)."""
        if not path:
            return ''
        return path.split('/')[-1]

    def __export_file(self, urls):
        """Download every URL in ``urls`` into a timestamped directory."""
        if not urls:
            return
        dir_name = '{0}_{1}/'.format(self.dir_name, int(time.time()))
        # exist_ok: a second run within the same second reuses the directory
        # instead of failing.
        os.makedirs(dir_name, exist_ok=True)
        if self.progress:
            self.progress.set_origin(len(urls))
        for count, url in enumerate(urls, 1):
            self.download(url, dir_name + self.__get_filename(url))
            if self.progress:
                self.progress.show(count)


 class Progress():
    """Single-line text progress bar written to standard output.

    Call ``set_origin(total)`` once, then ``show(done)`` after each step;
    the bar is redrawn in place with up to ``max_gauge`` ``#`` characters.
    """

    max_gauge = 40

    def __init__(self):
        # Until set_origin() is called there is no known amount of work.
        self.origin = 0

    def set_origin(self, origin):
        """Set the total number of steps that corresponds to a full bar."""
        self.origin = origin

    def show(self, increment):
        """Redraw the bar for ``increment`` completed steps."""
        rate = self.__calc(increment)
        sys.stdout.write('\rprogress: {0}'.format('#' * rate))
        # No newline is written, so flush explicitly or the bar may stay
        # in the output buffer.
        sys.stdout.flush()

    def __calc(self, increment):
        """Return how many gauge characters ``increment`` steps fill.

        Returns 0 while the total is unset or zero; the result never
        exceeds ``max_gauge``.
        """
        if not self.origin:
            return 0
        rate = round(increment / self.origin, 2)
        now_rate = math.ceil(self.max_gauge * rate)
        return max(0, min(self.max_gauge, now_rate))

 if __name__ == '__main__':
    # Script entry point: the page URL is taken from the first CLI argument.
    argv = sys.argv
    if not argv[1:]:
        # Without a URL there is nothing to do.
        print('no args')
        sys.exit()
    page_url = argv[1]
    print('download start')
    Downloader(page_url, Progress()).run_download()
    # Move the cursor below the in-place progress bar before the last message.
    print('\n')
    print('download finish')
	# coding:utf-8


	import sys
	import os
	import re
	import time
	import math
	from urllib import request, error
	from http import cookiejar


	class Downloader():
	    """Download every image referenced by a single HTML page.

	    The page at ``url`` is fetched and scanned for ``<a href="...">`` and
	    ``<img src="...">`` tags whose target ends in one of ``img_format``.
	    Each match is saved into a ``<dir_name>_<unix-seconds>/`` directory.

	    Limitation: the tag patterns only recognise tags in which ``href`` /
	    ``src`` directly follows the tag name and the value is double-quoted.
	    """

	    dir_name = './DL_images'
	    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8) '
	                  'AppleWebKit/536.25 (KHTML, like Gecko) Version/6.0 Safari/536.25')
	    # The "*" / "*?" quantifiers are required: without them the patterns
	    # demand exactly one whitespace character before the tag name and
	    # capture at most one character of the link.
	    a_tag_pattern = re.compile(r'<[\s]*a[\s]*href[\s]*=.*?>')
	    a_link_pattern = re.compile(r'href[\s]*="(.*?)"')
	    img_tag_pattern = re.compile(r'<[\s]*img[\s]*src[\s]*=.*?>')
	    img_link_pattern = re.compile(r'src[\s]*="(.*?)"')
	    img_format = ['jpg', 'jpeg', 'png', 'gif', 'bmp']

	    def __init__(self, url, progress=None):
	        """Remember the page ``url`` and an optional progress reporter.

	        ``progress`` may be any object providing ``set_origin(total)`` and
	        ``show(count)``; pass ``None`` to disable progress output.
	        """
	        self.url = url
	        self.progress = progress

	    def run_download(self):
	        """Fetch the page, extract the image URLs and download them all."""
	        urls = self.__parse_html(self.fetch_html())
	        self.__export_file(urls)

	    def download(self, url, file_name):
	        """Save the resource at ``url`` to ``file_name``.

	        Downloading is best-effort: failures are not raised.

	        Returns:
	            True when the file was written, False when the URL was invalid,
	            could not be fetched, or the file could not be written.
	        """
	        try:
	            # Request() raises ValueError for URLs without a usable scheme,
	            # so it has to live inside the try block as well.
	            req = request.Request(url)
	            req.add_header('User-agent', self.user_agent)
	            with request.build_opener().open(req) as conn:
	                data = conn.read()
	            # Read first, write second: a failed read leaves no empty file.
	            with open(file_name, 'wb') as img_file:
	                img_file.write(data)
	        except (error.URLError, IOError, ValueError):
	            return False
	        return True

	    def fetch_html(self):
	        """Return the text of the page at ``self.url``.

	        The body is decoded with the charset announced by the response
	        headers (UTF-8 when none is given); undecodable bytes are replaced
	        rather than raising.

	        Returns:
	            The page text, or an empty string when the page cannot be
	            opened or read.
	        """
	        cj = cookiejar.CookieJar()
	        opener = request.build_opener(request.HTTPCookieProcessor(cj))
	        opener.addheaders = [('User-agent', self.user_agent)]
	        try:
	            with opener.open(self.url) as conn:
	                raw = conn.read()
	                charset = conn.headers.get_content_charset() or 'utf-8'
	        except (error.URLError, IOError, ValueError):
	            return ''
	        try:
	            return raw.decode(charset, errors='replace')
	        except LookupError:
	            # The server announced a charset Python does not know.
	            return raw.decode('utf-8', errors='replace')

	    def __collect_links(self, html, tag_pattern, link_pattern):
	        """Return absolute image URLs found in tags matching ``tag_pattern``."""
	        from urllib.parse import urljoin

	        links = []
	        for tag in tag_pattern.findall(html):
	            match = link_pattern.search(tag)
	            if not match:
	                continue
	            link = match.group(1)
	            if link.split('.')[-1].lower() not in self.img_format:
	                continue
	            try:
	                # Resolve relative links against the page URL so that they
	                # can actually be requested later.
	                link = urljoin(self.url, link)
	            except ValueError:
	                pass  # keep the raw link; download() will report failure
	            links.append(link)
	        return links

	    def __parse_html(self, str):
	        """Extract image URLs from the HTML text ``str``.

	        Links from ``<a>`` tags come first, followed by ``<img>`` sources,
	        each group in document order.  Returns an empty list for empty input.
	        """
	        if not str:
	            return []
	        html = str
	        anchors = self.__collect_links(
	            html, self.a_tag_pattern, self.a_link_pattern)
	        images = self.__collect_links(
	            html, self.img_tag_pattern, self.img_link_pattern)
	        return anchors + images

	    def __get_filename(self, path):
	        """Return the part of ``path`` after the last ``/`` ('' if empty)."""
	        if not path:
	            return ''
	        return path.split('/')[-1]

	    def __export_file(self, urls):
	        """Download every URL in ``urls`` into a timestamped directory."""
	        if not urls:
	            return
	        dir_name = '{0}_{1}/'.format(self.dir_name, int(time.time()))
	        # exist_ok: a second run within the same second reuses the directory
	        # instead of failing.
	        os.makedirs(dir_name, exist_ok=True)
	        if self.progress:
	            self.progress.set_origin(len(urls))
	        for count, url in enumerate(urls, 1):
	            self.download(url, dir_name + self.__get_filename(url))
	            if self.progress:
	                self.progress.show(count)


	class Progress():
	    """Single-line text progress bar written to standard output.

	    Call ``set_origin(total)`` once, then ``show(done)`` after each step;
	    the bar is redrawn in place with up to ``max_gauge`` ``#`` characters.
	    """

	    max_gauge = 40

	    def __init__(self):
	        # Until set_origin() is called there is no known amount of work.
	        self.origin = 0

	    def set_origin(self, origin):
	        """Set the total number of steps that corresponds to a full bar."""
	        self.origin = origin

	    def show(self, increment):
	        """Redraw the bar for ``increment`` completed steps."""
	        rate = self.__calc(increment)
	        sys.stdout.write('\rprogress: {0}'.format('#' * rate))
	        # No newline is written, so flush explicitly or the bar may stay
	        # in the output buffer.
	        sys.stdout.flush()

	    def __calc(self, increment):
	        """Return how many gauge characters ``increment`` steps fill.

	        Returns 0 while the total is unset or zero; the result never
	        exceeds ``max_gauge``.
	        """
	        if not self.origin:
	            return 0
	        rate = round(increment / self.origin, 2)
	        now_rate = math.ceil(self.max_gauge * rate)
	        return max(0, min(self.max_gauge, now_rate))

	if __name__ == '__main__':
	    # Script entry point: the page URL is taken from the first CLI argument.
	    param = sys.argv
	    if len(param) < 2:
	        # Without a URL there is nothing to do.
	        print('no args')
	        sys.exit()
	    print('download start')
	    dl = Downloader(param[1], Progress())
	    dl.run_download()
	    # Move the cursor below the in-place progress bar before the last message.
	    print('\n')
	    print('download finish')