Last active: January 25, 2018 13:11
Revisions
VencentYoung revised this gist on Nov 8, 2015 (1 changed file with 1 addition and 1 deletion). The revision changes the keyword list in the __main__ block from the placeholder key_words = [''] to:

    key_words = ['夏洛特', '时代周刊']
VencentYoung created this gist on Nov 8, 2015.
# -*- coding:utf-8 -*-
import urllib.request
import http.cookiejar  # not used below
import requests
import threading
import os, re


def get_page(url, timeout=20):
    # Fetch the page and return its text; no further processing.
    try:
        request = urllib.request.Request(url)
        request.add_header('Referer', cl_url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        page = urllib.request.urlopen(request, timeout=timeout).read().decode('gbk')
        return page
    except:
        print('>>> Failed to download page...%s' % url)


def get_item(url):
    # Parse the page and return info for every thread on it: [address, title].
    # Some threads (especially on the Daguerre board) carry color markup that must be stripped first.
    try:
        page = get_page(url)
        page = re.sub('[\n|\r|\t]|<font color=.+?>|</font>', '', page)
        item_pattern = re.compile('(?<=<h3><a href="htm_data).+?(?=</a></h3>)')
        items = re.findall(item_pattern, page)
        res = [re.split('" target="_blank" id="">', item) for item in items]
        return res
    except:
        print('>>> Failed to fetch thread info...')


def get_range(url, page_start, page_end):
    # Batch-download thread info for several pages; not multithreaded yet, could be optimized.
    # Return format is the same as get_item.
    items = []
    for page_num in range(page_start, page_end + 1):
        try:
            print('>>> Downloading page %d...' % page_num)
            items = items + get_item(url + '&page=%d' % page_num)
            print('    Page %d downloaded' % page_num)
        except:
            print('    Page %d failed' % page_num)
        finally:
            print('>>> -------------------------------')
    return items


def search_item(key_word_list, items):
    # Keep only the items whose title contains at least one keyword.
    print('>>> Searching %d items with keywords: %s ...' % (len(items), ' | '.join(key_word_list)))
    search_result = []
    for item in items:
        for key_word in key_word_list:
            if key_word in item[1]:
                search_result.append(item)
                break
    print('>>> Found %d matching threads' % len(search_result))
    return search_result


def get_torrent_hash(page):
    hash_pattern = re.compile('(?<=hash=).+?(?=&z">)')
    torrent_hash = re.findall(hash_pattern, page)[0]
    return torrent_hash


def get_pic_urls(page):
    pic_pattern1 = re.compile(r"(?<=<input type='image' src=').+?(?='\s)")
    pic_pattern2 = re.compile(r"(?<=img src=').+?(?='\s)")
    pic_urls = re.findall(pic_pattern1, page) + re.findall(pic_pattern2, page)
    # pic_pattern = re.compile(r"(?<=[<input type='image' src='|img src=']).+?(?='\s)")
    # pic_urls = re.findall(pic_pattern, page)
    return pic_urls


def download_torrent(torrent_hash, torrent_name='', torrent_path=''):
    # The hash comes from the thread page; the torrent file itself is served by rmdown.com.
    try:
        print('>>> Downloading torrent...')
        download_url = 'http://www.rmdown.com/link.php?hash=' + torrent_hash
        torrent_server = 'http://www.rmdown.com/download.php'
        header = {'User-Agent': 'Mozilla/5.0', 'Referer': download_url}
        request = urllib.request.Request(download_url, headers=header)
        page = urllib.request.urlopen(request).read().decode('utf-8')
        reff_pattern = re.compile('(?<=NAME="reff" value=").+?(?="><BR>)')
        torrent_reff = re.findall(reff_pattern, page)[0]
        payload = {'ref': (None, torrent_hash), 'reff': (None, torrent_reff), 'submit': (None, 'download')}
        response = requests.post(torrent_server, files=payload, timeout=5)
        if len(torrent_name) == 0:
            torrent_name = torrent_hash
        else:
            torrent_name = re.sub(r'[>/:*\|?\<]', ' - ', torrent_name)
        if len(torrent_path) != 0:
            if not os.path.exists(torrent_path):
                os.makedirs(torrent_path)
            file_name = os.path.join(torrent_path, torrent_name + '.torrent')
        else:
            file_name = torrent_name + '.torrent'
        with open(file_name, "wb") as code:
            code.write(response.content)
    except Exception as e:
        # torrent_name is always bound here; file_name may not be if the request failed earlier.
        print('>>> Failed to download %s from %s ...' % (torrent_name, download_url))
        print(e)
    finally:
        print('>>> -------------------------------')


def download_pic(pic_url, pic_name='', pic_path=''):
    try:
        if len(pic_name) == 0:
            pic_name = re.split('/', pic_url)[-1]
        if len(pic_path) != 0:
            if not os.path.exists(pic_path):
                os.makedirs(pic_path)
            file_name = os.path.join(pic_path, pic_name)
        else:
            file_name = pic_name
        if os.path.isfile(file_name):
            print('    File already exists, skipping')
            return
        r = requests.get(pic_url, timeout=20)
        with open(file_name, "wb") as code:
            code.write(r.content)
        print('    Downloaded %s' % pic_url)
    except Exception as e:
        print(e)
        print('    Failed to download %s' % pic_url)


def download_pics(pic_urls, pic_path):
    # Download all pictures of a thread concurrently, one thread per URL.
    print('>>> %d pictures to download...' % len(pic_urls))
    task_threads = []
    for pic_url in pic_urls:
        t = threading.Thread(target=download_pic, args=(pic_url, '', pic_path))
        task_threads.append(t)
    for task in task_threads:
        task.start()
    for task in task_threads:
        task.join()


def download_pics_from_range(url, page_start, page_end, key_word_list, save_path):
    items = get_range(url, page_start, page_end)
    matched_items = search_item(key_word_list, items)
    for i in matched_items:
        print('>>> Downloading thread %s' % i[1])
        page = get_page(cl_url + 'htm_data' + i[0])
        pic_urls = get_pic_urls(page)
        print(save_path + '\\' + re.sub(r'[>/:*\|?\<]', '-', i[1]))
        download_pics(pic_urls, save_path + '\\' + re.sub(r'[>/:*\|?\<]', '-', i[1]))


def download_all_from_range(url, page_start, page_end, key_word_list, save_path):
    items = get_range(url, page_start, page_end)
    matched_items = search_item(key_word_list, items)
    for i in matched_items:
        print('>>> Downloading thread %s' % i[1])
        page = get_page(cl_url + 'htm_data' + i[0])
        pic_urls = get_pic_urls(page)
        torrent_hash = get_torrent_hash(page)
        download_pics(pic_urls, save_path + '\\' + re.sub(r'[>/:*\|?\<]', '-', i[1]))
        download_torrent(torrent_hash, i[1], save_path + '\\' + re.sub(r'[>/:*\|?\<]', '-', i[1]))


if __name__ == '__main__':
    cl_url = 'http://你懂得/'  # changes periodically
    Asia_non_mosaic = cl_url + 'thread0806.php?fid=2'      # Asia, uncensored
    Asia_mosaic = cl_url + 'thread0806.php?fid=15'         # Asia, censored
    Original_Western = cl_url + 'thread0806.php?fid=4'     # Western originals
    Original_Animation = cl_url + 'thread0806.php?fid=5'   # Animation originals
    Flag_of_Daguerre = cl_url + 'thread0806.php?fid=16'    # Flag of Daguerre
    New_Era_for_All = cl_url + 'thread0806.php?fid=8'      # New era for all of us
    Tech_Talk = cl_url + 'thread0806.php?fid=7'            # Tech talk board
    address_dic = {1: Asia_non_mosaic, 2: Asia_mosaic, 3: Original_Western,
                   4: Original_Animation, 5: Flag_of_Daguerre,
                   6: New_Era_for_All, 7: Tech_Talk}
    welcome_info = '''>>> You, pillar of the nation, please exercise restraint
    1. Asia, uncensored        Asia_non_mosaic
    2. Asia, censored          Asia_mosaic
    3. Western originals       Original_Western
    4. Animation originals     Original_Animation
    5. Flag of Daguerre        Flag_of_Daguerre
    6. New era for all of us   New_Era_for_All
    7. Tech talk               Tech_Talk
    '''
    print(welcome_info)
    save_path = 'D:\\FTPRoot'
    key_words = ['']
    m = search_item(key_words, get_range(Tech_Talk, 1, 20))
    for s in m:
        print(s)
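A minimal usage sketch of the batch helper defined above. It assumes these lines are appended inside the __main__ block (so address_dic, save_path, and the functions are in scope); the board choice and page range are illustrative values, and the keyword list is the one introduced in the Nov 8, 2015 revision:

    # Hypothetical driver lines for the end of the __main__ block (illustrative values only).
    board = address_dic[7]               # Tech_Talk
    key_words = ['夏洛特', '时代周刊']    # keywords from the later revision
    # Fetch pages 1-5 of the board, keep keyword-matched threads, then save
    # every picture and the torrent of each match under save_path.
    download_all_from_range(board, 1, 5, key_words, save_path)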