Skip to content

Instantly share code, notes, and snippets.

@srsman
Forked from anonymous/zhihu_picture_downloader.py
Created October 28, 2016 15:25
Show Gist options
  • Save srsman/4f93b5c956d389959bd97e685c7b4f38 to your computer and use it in GitHub Desktop.
Save srsman/4f93b5c956d389959bd97e685c7b4f38 to your computer and use it in GitHub Desktop.

Revisions

  1. @invalid-email-address Anonymous created this gist Oct 28, 2016.
    163 changes: 163 additions & 0 deletions zhihu_picture_downloader.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,163 @@
    #-*- coding=utf-8 -*-
    """
    Zhihu picture downloader.

    Logs in to zhihu.com and downloads the images found in the answers to a
    question.
    """
    import requests
    import re
    import json
    import time
    from PIL import Image
    import cStringIO
    import cookielib
    import urllib
    import os

    api_url='https://www.zhihu.com/node/QuestionAnswerListV2'
    login_url='https://www.zhihu.com/login/'
    topic_url='https://www.zhihu.com/question/'


    headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }

    session=requests.Session()
    session.headers=headers
    session.cookies = cookielib.LWPCookieJar(filename='cookies')
    try:
    session.cookies.load(ignore_discard=True)
    except:
    print u"未登陆过,需先登录"


    def get_xsrf(url="http://www.zhihu.com"):
        """Fetch *url* and return the ``_xsrf`` token embedded in the page.

        The token changes dynamically, so it is read from the page on every
        call instead of being cached.

        Args:
            url: Page to fetch through the shared module-level ``session``.

        Returns:
            The value captured by the first ``name="_xsrf" value="..."`` match
            in the response body.

        Raises:
            IndexError: If the page contains no ``_xsrf`` field.
        """
        html = session.get(url).content
        tokens = re.findall(r'name="_xsrf" value="(.*?)"', html)
        if not tokens:
            # Keep IndexError (what a bare ``[0]`` lookup raises) so existing
            # callers see the same exception type, but say what went wrong.
            raise IndexError('no _xsrf token found in page %s' % url)
        return tokens[0]


    def ImageScale(url, session=None):
        """Download the image at *url* and display it with PIL's ``Image.show()``.

        Args:
            url: Address of the image to fetch.
            session: Object whose ``get(url)`` performs the download. A new
                ``requests.Session`` is created when this is omitted.
        """
        if session is None:
            session = requests.Session()
        # Wrap the downloaded bytes in a file-like object so PIL can read them.
        image_buffer = cStringIO.StringIO(session.get(url).content)
        img = Image.open(image_buffer)
        img.show()


    def get_captcha():
    global session
    t=str(int(time.time()*1000))
    captcha_url='https://www.zhihu.com/captcha.gif?r=%s&type=login'%t
    print captcha_url
    ImageScale(captcha_url,session)
    print u'请输入验证码:'
    yzm=raw_input()
    return yzm

    def isLogin():
        """Report whether the shared ``session`` is currently logged in.

        Requests the settings/profile page without following redirects and
        treats an HTTP 200 response as "logged in".

        Returns:
            True if the response status code is 200, otherwise False.
        """
        url = "https://www.zhihu.com/settings/profile"
        status = session.get(url, allow_redirects=False).status_code
        # status_code is already an int, so compare it directly.
        return status == 200

    def login(email,passwd):
    global session
    isemail=re.search('@',email)
    if isemail:
    loginurl=login_url+'email'
    data={'_xsrf':get_xsrf()
    ,'password':passwd
    ,'remember_me':'true'
    ,'email':email}
    else:
    loginurl=login_url+'phone_num'
    data={'_xsrf':get_xsrf()
    ,'password':passwd
    ,'remember_me':'true'
    ,'phone_num':email}
    try:
    login_page=session.post(loginurl,data=data)
    login_code=login_page.content
    print login_page.status
    print login_code
    except:
    data['captcha']=get_captcha()
    login_page=session.post(loginurl,data=data)
    login_code=json.loads(login_page.content)
    print login_code['msg']
    session.cookies.save()


    def get_pic_from_topic(id, offset):
        """Return the image URLs found in one page of answers to a question.

        Args:
            id: Question id; appended to ``topic_url`` and sent as ``url_token``.
            offset: Value sent as ``offset`` in the request; each request asks
                for a page of 10 answers.

        Returns:
            A list of every ``data-actualsrc`` attribute value found in the
            entries of the reply's ``msg`` field, in order of appearance.
        """
        topicurl = topic_url + str(id)
        # Fetch the token from the question page before touching the headers.
        token = get_xsrf(topicurl)

        params = {"url_token": id, "pagesize": 10, "offset": offset}
        payload = {'method': 'next', 'params': json.dumps(params)}

        extra_headers = (
            ('Referer', topicurl),
            ('Host', 'www.zhihu.com'),
            ('Origin', 'https://www.zhihu.com'),
            ('X-Xsrftoken', token),
        )
        for header_name, header_value in extra_headers:
            session.headers[header_name] = header_value

        reply = session.post(api_url, data=payload)
        fragments = json.loads(reply.content)['msg']

        img_pattern = re.compile('data-actualsrc="(.*?)"')
        return [src
                for fragment in fragments
                for src in img_pattern.findall(fragment)]

    def downloader(url,path):
    try:
    filename=url.split('/')[-1]
    save=os.path.join(path,filename)
    print u'开始下载 ',filename
    urllib.urlretrieve(url,filename=save)
    except Exception,e:
    print u'下载出错,错误信息为:'
    print e


    if __name__=='__main__':
    email='知乎账号'
    passwd='知乎密码'
    is_login=isLogin()
    if not is_login:
    login(email,passwd)
    offset=0
    pictures=[]
    print u"""####################\n# 知乎图片下载器 #\n####################
    """
    print u"请输入知乎问题id,比如https://www.zhihu.com/question/52049909,id就是52049909"
    id=input()
    print u'=====开始解析======'
    while 1:
    print u"+++++正在解析第%d页+++++"%(offset/10+1)
    pics=get_pic_from_topic(id,offset)
    if len(pics)==0:
    print u"解析完毕,共找到%d张图片"%len(pictures)
    break
    pictures.extend(pics)
    offset+=10
    print u"=====开始下载图片====="
    basepath=os.path.abspath('.')
    savepath=os.path.join(basepath,str(id))
    if not os.path.exists(savepath):
    os.mkdir(savepath)
    for pic in pictures:
    downloader(pic,savepath)
    print u"=====下载完毕====="