Skip to content

Instantly share code, notes, and snippets.

@evi1m0
Created April 10, 2015 09:36
Show Gist options
  • Save evi1m0/a3cc41690c69bce02ed3 to your computer and use it in GitHub Desktop.
Save evi1m0/a3cc41690c69bce02ed3 to your computer and use it in GitHub Desktop.

Revisions

  1. evi1m0 created this gist Apr 10, 2015.
    57 changes: 57 additions & 0 deletions hi_baidu_spider.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,57 @@
    #!/usr/bin/env python
    # coding=utf8
    # author=evi1m0#n0tr00t
    # Fri Apr 10 14:14:35 2015

    import os
    import re
    import sys
    import wget
    import requests
    import urlparse
    import threadpool as tp

    def _archives(author):
        """Collect the post URLs listed in a hi.baidu.com blog's archive pages.

        Fetches the author's archive index, extracts the per-month archive
        links from it, then fetches every month page and gathers the post
        links found there.

        Args:
            author: Blog name; interpolated into the archive index URL.

        Returns:
            A list of post URLs in the order they were found. The list is
            empty when the archive index lacks the expected markup.
        """
        archives_url = 'http://hi.baidu.com/{}/archive'.format(author)
        # Single-argument print(...) behaves exactly like the print statement
        # on Python 2 and also parses on Python 3.
        print('[*] Target URL: {}'.format(archives_url))
        year_content = requests.get(archives_url).content
        year_blocks = re.findall('<div class=fi-list id=fiList>(.*?)</section>', year_content)
        if not year_blocks:
            # Indexing [0] unconditionally would raise a bare IndexError when
            # the page has no archive list; report the problem and return
            # nothing to download instead.
            print('[-] Error: archive list not found at {}'.format(archives_url))
            return []
        months = re.findall('<a href="(.*?)" class="fi-border-bt2', year_blocks[0])
        print('[*] Months count: {}'.format(len(months)))
        # Keep only the links whose query string carries a month= parameter.
        months_url = [
            month for month in months
            if 'month=' in urlparse.urlparse(month).query
        ]
        archives_list = []
        for url in months_url:
            month_content = requests.get(url).content
            archives_list.extend(
                re.findall('</div><a href="(.*?)" class=info-detail target=_blank>', month_content)
            )
        return archives_list

    def main(url):
        """Download a single post page into the author's directory.

        Fetches the page at ``url``, takes the first ``content-title`` heading
        as the post title and saves the page with ``wget.download`` as
        ``<sys.argv[1]>/<title>``. Download errors are printed, not raised.

        Args:
            url: Address of the post page to download.
        """
        _page = requests.get(url).content
        _titles = re.findall('<h2 class="title content-title">(.*?)</h2>', _page)
        if not _titles:
            # Without this guard a page lacking the heading raises IndexError
            # before the try block below is ever reached.
            print('[-] Error: title not found in {}'.format(url))
            return
        _title = _titles[0]
        # The title comes from the remote page. A path separator inside it
        # would make the output path point into a subdirectory that does not
        # exist (or outside the author directory), so neutralise separators.
        _safe_title = _title
        for _sep in (os.sep, os.altsep):
            if _sep:  # os.altsep is None on platforms with one separator
                _safe_title = _safe_title.replace(_sep, '_')
        _filename = '{author}/{title}'.format(author=sys.argv[1], title=_safe_title)
        print('[+] Download: {}'.format(_title))
        try:
            wget.download(url, out=_filename, bar='')
        except Exception as e:
            # Best effort: one failed download must not stop the other workers.
            print('[-] Error: ' + str(e))

    if __name__ == '__main__':
        # Script entry point: expects the blog name as the first argument.
        if len(sys.argv) < 2:
            print('[-] Usage: {} Blog_name'.format(sys.argv[0]))
            print('[-] Example: {} evi1m0'.format(sys.argv[0]))
            # Missing argument is a usage error, so exit with a non-zero status.
            sys.exit(1)
        author = sys.argv[1]
        # Downloads are written into a directory named after the blog.
        if not os.path.exists(author):
            os.mkdir(author)
        archives = _archives(author)
        print('[*] Archives statistics: {}'.format(len(archives)))
        # threadpool: queue one main(url) request per post URL, then wait
        # until the pool has processed all of them.
        pool = tp.ThreadPool(30)
        for req in tp.makeRequests(main, archives):
            pool.putRequest(req)
        pool.wait()