@VencentYoung
Last active January 25, 2018 13:11
Gist: VencentYoung/4b320561534d6ac3caa2
Revisions

  1. VencentYoung revised this gist Nov 8, 2015. 1 changed file with 1 addition and 1 deletion.

     2 changes: 1 addition & 1 deletion in 1024.py

     @@ -188,7 +188,7 @@ def download_all_from_range(url, page_start, page_end, key_word_list, save_path)
          print(welcome_info)

          save_path = 'D:\\FTPRoot'
     -    key_words = ['']
     +    key_words = ['夏洛特', '时代周刊']
          m = search_item(key_words, get_range(Tech_Talk,1,20))
          for s in m:
              print(s)
  2. VencentYoung created this gist Nov 8, 2015.

     194 changes: 194 additions & 0 deletions in 1024.py

     @@ -0,0 +1,194 @@
    # -*- coding:utf-8 -*-

    import urllib.request, http.cookiejar, requests
    import threading
    import os,re

    def get_page(url, timeout=20):
        # Just fetch and return the page; no other processing
        # cl_url is a module-level global assigned under __main__
        try:
            request = urllib.request.Request(url)
            request.add_header('Referer', cl_url)
            request.add_header('User-Agent', 'Mozilla/5.0')
            page = urllib.request.urlopen(request, timeout=timeout).read().decode('gbk')
            return page
        except:
            print('>>> Page download failed...%s' % url)

    def get_item(url):
        # Parse the page and return every post on it as [address, title]
        # Some posts (notably on the Daguerre board) carry color markup that has to be stripped first
        try:
            page = get_page(url)
            page = re.sub('[\n\r\t]|<font color=.+?>|</font>', '', page)
            item_pattern = re.compile('(?<=<h3><a href="htm_data).+?(?=</a></h3>)')
            items = re.findall(item_pattern, page)
            res = [re.split('" target="_blank" id="">', item) for item in items]
            return res
        except:
            print('>>> Failed to get post info...')
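    # A sketch of the return shape (hypothetical values): the split above turns
    # each matched anchor into [relative address, title], e.g.
    #   get_item(Tech_Talk + '&page=1') -> [['/7/1234567.html', 'Some post title'], ...]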

    def get_range(url, page_start, page_end):
        # Fetch post info for a range of pages; single-threaded, could be optimized
        # Returns the same structure as get_item
        items = []
        for page_num in range(page_start, page_end + 1):
            try:
                print('>>> Downloading page %d...' % page_num)
                items = items + get_item(url + '&page=%d' % page_num)
                print('    Page %d downloaded' % page_num)
            except:
                print('    Page %d download failed' % page_num)
            finally:
                print('>>> -------------------------------')
        return items

    def search_item(key_word_list, items):
        print('>>> Searching %d entries with keywords: %s ...' % (len(items), ' | '.join(key_word_list)))
        search_result = []
        for item in items:
            for key_word in key_word_list:
                if key_word in item[1]:
                    search_result.append(item)
                    break
        print('>>> Found %d matching topics' % len(search_result))
        return search_result
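    # Matching sketch (hypothetical items): any item whose title contains one of
    # the keywords is kept, e.g. with the keywords a later revision of this gist uses:
    #   search_item(['夏洛特', '时代周刊'], [['/7/101.html', '夏洛特 1080p'], ['/7/102.html', 'other']])
    #   -> [['/7/101.html', '夏洛特 1080p']]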

    def get_torrent_hash(page):
        hash_pattern = re.compile('(?<=hash=).+?(?=&z">)')
        torrent_hash = re.findall(hash_pattern, page)[0]
        return torrent_hash
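    # Sketch of what the lookaround pair captures, on hypothetical markup:
    #   '...link.php?hash=a1b2c3d4e5&z">'  ->  'a1b2c3d4e5'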

    def get_pic_urls(page):
        pic_pattern1 = re.compile('(?<=<input type=\'image\' src=\').+?(?=\'\s)')
        pic_pattern2 = re.compile('(?<=img src=\').+?(?=\'\s)')
        pic_urls = re.findall(pic_pattern1, page) + re.findall(pic_pattern2, page)
        # pic_pattern = re.compile('(?<=[<input type=\'image\' src=\'|img src=\']).+?(?=\'\s)')
        # pic_urls = re.findall(pic_pattern, page)
        return pic_urls
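    # Sketch, on hypothetical markup, of what the two patterns pull out:
    #   "<input type='image' src='http://img.example/1.jpg' ...>"  ->  http://img.example/1.jpg
    #   "<img src='http://img.example/2.jpg' ...>"                 ->  http://img.example/2.jpg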

    def download_torrent(torrent_hash, torrent_name='', torrent_path=''):
        # download_url here is built from the hash taken off the post page
        try:
            print('>>> Downloading torrent...')
            download_url = 'http://www.rmdown.com/link.php?hash=' + torrent_hash
            torrent_server = 'http://www.rmdown.com/download.php'

            header = {'User-Agent': 'Mozilla/5.0', 'Referer': download_url}
            request = urllib.request.Request(download_url, headers=header)
            page = urllib.request.urlopen(request).read().decode('utf-8')
            reff_pattern = re.compile('(?<=NAME="reff" value=").+?(?="><BR>)')
            torrent_reff = re.findall(reff_pattern, page)[0]

            # (None, value) tuples make requests send each entry as a plain
            # multipart form field rather than a file upload
            payload = {'ref': (None, torrent_hash),
                       'reff': (None, torrent_reff),
                       'submit': (None, 'download')}
            response = requests.post(torrent_server, files=payload, timeout=5)

            if len(torrent_name) == 0:
                torrent_name = torrent_hash
            else:
                torrent_name = re.sub('[>/:*\|?\\<]', ' - ', torrent_name)

            if len(torrent_path) != 0:
                if not os.path.exists(torrent_path):
                    os.makedirs(torrent_path)
                file_name = os.path.join(torrent_path, torrent_name + '.torrent')
            else:
                file_name = torrent_name + '.torrent'

            with open(file_name, "wb") as code:
                code.write(response.content)
        except Exception as e:
            # report via torrent_hash: download_url/file_name may not be bound yet
            print('>>> Failed to download torrent %s...' % torrent_hash)
            print(e)
        finally:
            print('>>> -------------------------------')
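    # Usage sketch (hypothetical hash and title):
    #   download_torrent('a1b2c3d4e5', 'some topic title', 'D:\\FTPRoot\\some topic title')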

    def download_pic(pic_url, pic_name='', pic_path=''):
        try:
            if len(pic_name) == 0: pic_name = re.split('/', pic_url)[-1]
            if len(pic_path) != 0:
                if not os.path.exists(pic_path):
                    os.makedirs(pic_path)
                file_name = os.path.join(pic_path, pic_name)
            else:
                file_name = pic_name
            if os.path.isfile(file_name):
                print('    File already exists, no need to download again')
                return
            r = requests.get(pic_url, timeout=20)
            with open(file_name, "wb") as code:
                code.write(r.content)
            print('    Downloaded %s' % pic_url)
        except Exception as e:
            print(e)
            print('    Download failed %s' % pic_url)

    def download_pics(pic_urls, pic_path):
        print('>>> %d pictures to download...' % len(pic_urls))
        task_threads = []
        for pic_url in pic_urls:
            t = threading.Thread(target=download_pic, args=(pic_url, '', pic_path))
            task_threads.append(t)
        for task in task_threads:
            task.start()
        for task in task_threads:
            task.join()
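    # The loop above spawns one thread per picture with no upper bound. A
    # bounded-concurrency sketch (not in the original gist), reusing the same
    # download_pic via the standard-library pool:
    from concurrent.futures import ThreadPoolExecutor

    def download_pics_pooled(pic_urls, pic_path, workers=8):
        # submit every url; the with-block joins all workers on exit
        with ThreadPoolExecutor(max_workers=workers) as pool:
            for pic_url in pic_urls:
                pool.submit(download_pic, pic_url, '', pic_path)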

    def download_pics_from_range(url, page_start, page_end, key_word_list, save_path):
        items = get_range(url, page_start, page_end)
        matched_items = search_item(key_word_list, items)
        for i in matched_items:
            print('>>> Downloading topic %s' % i[1])
            page = get_page(cl_url + 'htm_data' + i[0])
            pic_urls = get_pic_urls(page)
            print(save_path + '\\' + re.sub('[>/:*\|?\\<]', '-', i[1]))
            download_pics(pic_urls, save_path + '\\' + re.sub('[>/:*\|?\\<]', '-', i[1]))

    def download_all_from_range(url, page_start, page_end, key_word_list, save_path):
        items = get_range(url, page_start, page_end)
        matched_items = search_item(key_word_list, items)
        for i in matched_items:
            print('>>> Downloading topic %s' % i[1])
            page = get_page(cl_url + 'htm_data' + i[0])
            pic_urls = get_pic_urls(page)
            torrent_hash = get_torrent_hash(page)
            download_pics(pic_urls, save_path + '\\' + re.sub('[>/:*\|?\\<]', '-', i[1]))
            download_torrent(torrent_hash, i[1], save_path + '\\' + re.sub('[>/:*\|?\\<]', '-', i[1]))

    if __name__ == '__main__':

        cl_url = 'http://你懂得/' # 你懂得 = "you know which"; the domain rotates periodically
        Asia_non_mosaic = cl_url + 'thread0806.php?fid=2'    # Asia, uncensored
        Asia_mosaic = cl_url + 'thread0806.php?fid=15'       # Asia, censored
        Original_Western = cl_url + 'thread0806.php?fid=4'   # Western originals
        Original_Animation = cl_url + 'thread0806.php?fid=5' # animation originals
        Flag_of_Daguerre = cl_url + 'thread0806.php?fid=16'  # Flag of Daguerre
        New_Era_for_All = cl_url + 'thread0806.php?fid=8'    # us in the new era
        Tech_Talk = cl_url + 'thread0806.php?fid=7'          # tech discussion board
        address_dic = {1: Asia_non_mosaic,
                       2: Asia_mosaic,
                       3: Original_Western,
                       4: Original_Animation,
                       5: Flag_of_Daguerre,
                       6: New_Era_for_All,
                       7: Tech_Talk}

        welcome_info = '''>>> You, pillar of the nation, please exercise restraint
        1. Asia, uncensored     Asia_non_mosaic
        2. Asia, censored       Asia_mosaic
        3. Western originals    Original_Western
        4. Animation originals  Original_Animation
        5. Flag of Daguerre     Flag_of_Daguerre
        6. Us in the New Era    New_Era_for_All
        7. Tech discussion      Tech_Talk
        '''
        print(welcome_info)

        save_path = 'D:\\FTPRoot'
        key_words = ['']  # the empty string matches every title
        m = search_item(key_words, get_range(Tech_Talk, 1, 20))
        for s in m:
            print(s)
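    # A minimal driver sketch using the same globals: fetch pictures and
    # torrents for every matching topic instead of only printing the matches:
    #   download_all_from_range(Tech_Talk, 1, 20, key_words, save_path)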