Last active: January 25, 2018 13:11
Revisions
VencentYoung revised this gist on Nov 8, 2015 (1 changed file with 1 addition and 1 deletion). The revision changes the keyword list in the __main__ block from the placeholder key_words = [''] to:

    key_words = ['夏洛特', '时代周刊']
VencentYoung created this gist on Nov 8, 2015.
# -*- coding:utf-8 -*-
import urllib.request
import http.cookiejar  # not used below
import requests
import threading
import os, re


def get_page(url, timeout=20):
    # Fetch the page and return its text; no further processing.
    try:
        request = urllib.request.Request(url)
        request.add_header('Referer', cl_url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        page = urllib.request.urlopen(request, timeout=timeout).read().decode('gbk')
        return page
    except:
        print('>>> Failed to download page...%s' % url)


def get_item(url):
    # Parse the page and return info for every thread on it: [address, title].
    # Some threads (especially on the Daguerre board) carry color markup that must be stripped first.
    try:
        page = get_page(url)
        page = re.sub('[\n|\r|\t]|<font color=.+?>|</font>', '', page)
        item_pattern = re.compile('(?<=<h3><a href="htm_data).+?(?=</a></h3>)')
        items = re.findall(item_pattern, page)
        res = [re.split('" target="_blank" id="">', item) for item in items]
        return res
    except:
        print('>>> Failed to fetch thread info...')


def get_range(url, page_start, page_end):
    # Batch-download thread info for several pages; not multithreaded yet, could be optimized.
    # Return format is the same as get_item.
    items = []
    for page_num in range(page_start, page_end + 1):
        try:
            print('>>> Downloading page %d...' % page_num)
            items = items + get_item(url + '&page=%d' % page_num)
            print('    Page %d downloaded' % page_num)
        except:
            print('    Page %d failed' % page_num)
        finally:
            print('>>> -------------------------------')
    return items


def search_item(key_word_list, items):
    # Keep only the items whose title contains at least one keyword.
    print('>>> Searching %d items with keywords: %s ...' % (len(items), ' | '.join(key_word_list)))
    search_result = []
    for item in items:
        for key_word in key_word_list:
            if key_word in item[1]:
                search_result.append(item)
                break
    print('>>> Found %d matching threads' % len(search_result))
    return search_result


def get_torrent_hash(page):
    hash_pattern = re.compile('(?<=hash=).+?(?=&z">)')
    torrent_hash = re.findall(hash_pattern, page)[0]
    return torrent_hash


def get_pic_urls(page):
    pic_pattern1 = re.compile(r"(?<=<input type='image' src=').+?(?='\s)")
    pic_pattern2 = re.compile(r"(?<=img src=').+?(?='\s)")
    pic_urls = re.findall(pic_pattern1, page) + re.findall(pic_pattern2, page)
    # pic_pattern = re.compile(r"(?<=[<input type='image' src='|img src=']).+?(?='\s)")
    # pic_urls = re.findall(pic_pattern, page)
    return pic_urls


def download_torrent(torrent_hash, torrent_name='', torrent_path=''):
    # The hash comes from the thread page; the torrent file itself is served by rmdown.com.
    try:
        print('>>> Downloading torrent...')
        download_url = 'http://www.rmdown.com/link.php?hash=' + torrent_hash
        torrent_server = 'http://www.rmdown.com/download.php'
        header = {'User-Agent': 'Mozilla/5.0', 'Referer': download_url}
        request = urllib.request.Request(download_url, headers=header)
        page = urllib.request.urlopen(request).read().decode('utf-8')
        reff_pattern = re.compile('(?<=NAME="reff" value=").+?(?="><BR>)')
        torrent_reff = re.findall(reff_pattern, page)[0]
        payload = {'ref': (None, torrent_hash), 'reff': (None, torrent_reff), 'submit': (None, 'download')}
        response = requests.post(torrent_server, files=payload, timeout=5)
        if len(torrent_name) == 0:
            torrent_name = torrent_hash
        else:
            torrent_name = re.sub(r'[>/:*\|?\<]', ' - ', torrent_name)
        if len(torrent_path) != 0:
            if not os.path.exists(torrent_path):
                os.makedirs(torrent_path)
            file_name = os.path.join(torrent_path, torrent_name + '.torrent')
        else:
            file_name = torrent_name + '.torrent'
        with open(file_name, "wb") as code:
            code.write(response.content)
    except Exception as e:
        # torrent_name is always bound here; file_name may not be if the request failed earlier.
        print('>>> Failed to download %s from %s ...' % (torrent_name, download_url))
        print(e)
    finally:
        print('>>> -------------------------------')


def download_pic(pic_url, pic_name='', pic_path=''):
    try:
        if len(pic_name) == 0:
            pic_name = re.split('/', pic_url)[-1]
        if len(pic_path) != 0:
            if not os.path.exists(pic_path):
                os.makedirs(pic_path)
            file_name = os.path.join(pic_path, pic_name)
        else:
            file_name = pic_name
        if os.path.isfile(file_name):
            print('    File already exists, skipping')
            return
        r = requests.get(pic_url, timeout=20)
        with open(file_name, "wb") as code:
            code.write(r.content)
        print('    Downloaded %s' % pic_url)
    except Exception as e:
        print(e)
        print('    Failed to download %s' % pic_url)


def download_pics(pic_urls, pic_path):
    # Download all pictures of a thread concurrently, one thread per URL.
    print('>>> %d pictures to download...' % len(pic_urls))
    task_threads = []
    for pic_url in pic_urls:
        t = threading.Thread(target=download_pic, args=(pic_url, '', pic_path))
        task_threads.append(t)
    for task in task_threads:
        task.start()
    for task in task_threads:
        task.join()


def download_pics_from_range(url, page_start, page_end, key_word_list, save_path):
    items = get_range(url, page_start, page_end)
    matched_items = search_item(key_word_list, items)
    for i in matched_items:
        print('>>> Downloading thread %s' % i[1])
        page = get_page(cl_url + 'htm_data' + i[0])
        pic_urls = get_pic_urls(page)
        print(save_path + '\\' + re.sub(r'[>/:*\|?\<]', '-', i[1]))
        download_pics(pic_urls, save_path + '\\' + re.sub(r'[>/:*\|?\<]', '-', i[1]))


def download_all_from_range(url, page_start, page_end, key_word_list, save_path):
    items = get_range(url, page_start, page_end)
    matched_items = search_item(key_word_list, items)
    for i in matched_items:
        print('>>> Downloading thread %s' % i[1])
        page = get_page(cl_url + 'htm_data' + i[0])
        pic_urls = get_pic_urls(page)
        torrent_hash = get_torrent_hash(page)
        download_pics(pic_urls, save_path + '\\' + re.sub(r'[>/:*\|?\<]', '-', i[1]))
        download_torrent(torrent_hash, i[1], save_path + '\\' + re.sub(r'[>/:*\|?\<]', '-', i[1]))


if __name__ == '__main__':
    cl_url = 'http://你懂得/'  # changes periodically
    Asia_non_mosaic = cl_url + 'thread0806.php?fid=2'      # Asia, uncensored
    Asia_mosaic = cl_url + 'thread0806.php?fid=15'         # Asia, censored
    Original_Western = cl_url + 'thread0806.php?fid=4'     # Western originals
    Original_Animation = cl_url + 'thread0806.php?fid=5'   # Animation originals
    Flag_of_Daguerre = cl_url + 'thread0806.php?fid=16'    # Flag of Daguerre
    New_Era_for_All = cl_url + 'thread0806.php?fid=8'      # New era for all of us
    Tech_Talk = cl_url + 'thread0806.php?fid=7'            # Tech talk board
    address_dic = {1: Asia_non_mosaic, 2: Asia_mosaic, 3: Original_Western,
                   4: Original_Animation, 5: Flag_of_Daguerre,
                   6: New_Era_for_All, 7: Tech_Talk}
    welcome_info = '''>>> You, pillar of the nation, please exercise restraint
    1. Asia, uncensored        Asia_non_mosaic
    2. Asia, censored          Asia_mosaic
    3. Western originals       Original_Western
    4. Animation originals     Original_Animation
    5. Flag of Daguerre        Flag_of_Daguerre
    6. New era for all of us   New_Era_for_All
    7. Tech talk               Tech_Talk
    '''
    print(welcome_info)
    save_path = 'D:\\FTPRoot'
    key_words = ['']
    m = search_item(key_words, get_range(Tech_Talk, 1, 20))
    for s in m:
        print(s)
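A minimal usage sketch of the batch helper defined above. It assumes these lines are appended inside the __main__ block (so address_dic, save_path, and the functions are in scope); the board choice and page range are illustrative values, and the keyword list is the one introduced in the Nov 8, 2015 revision:

    # Hypothetical driver lines for the end of the __main__ block (illustrative values only).
    board = address_dic[7]               # Tech_Talk
    key_words = ['夏洛特', '时代周刊']    # keywords from the later revision
    # Fetch pages 1-5 of the board, keep keyword-matched threads, then save
    # every picture and the torrent of each match under save_path.
    download_all_from_range(board, 1, 5, key_words, save_path)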