Last active
December 28, 2018 07:55
-
-
Save lefttree/5c92a2480376650567ad640016efb188 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # encoding: utf-8 | |
import ast
import json
import os
import re
import time

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Log in to renren.com through a real Chrome session, using credentials typed
# by the user, and copy the resulting cookies into ``cookie_dic`` so that the
# later ``requests`` calls can reuse the authenticated session.
# NOTE(review): the prompts ask for quoted input, which suggests Python 2
# ``input`` semantics -- confirm which interpreter this script targets.
login_account = input("请输入你的用户名(请加引号):")
password = input("请输入你的密码(请加引号):")

# URLs of photos whose download failed; appended to by download_photo().
error_list = []
chromepath = r"/Users/bylixiang/bin/chromedriver"
# Cookies flattened to a plain name -> value mapping, as passed to requests.
cookie_dic = {}
# Seconds to pause between two cookie polls while waiting for the login.
COOKIE_POLL_INTERVAL = 0.5

chrome_opt = webdriver.ChromeOptions()
# Presumably value 2 tells Chrome to block image loading -- TODO confirm.
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_opt.add_experimental_option("prefs", prefs)
# NOTE(review): the positional driver path and ``chrome_options=`` keyword are
# kept from the original; newer Selenium releases may expect a different call.
browser = webdriver.Chrome(chromepath, chrome_options=chrome_opt)
try:
    browser.get('http://www.renren.com/')
    wait = WebDriverWait(browser, 3)

    # Fill in the account and password fields once they are present.
    login = wait.until(
        EC.presence_of_element_located((By.XPATH, "//input[@name='email']"))
    )
    login.send_keys(login_account)
    pwd = wait.until(
        EC.presence_of_element_located((By.XPATH, "//input[@id='password']"))
    )
    pwd.send_keys(password)

    # Tick the auto-login checkbox, then submit the form.
    browser.find_element(By.XPATH, "//form[@id='loginForm']/dl[@class='savepassword clearfix']/dt/label[@class='labelCheckbox']/input[@id='autoLogin']").click()
    browser.find_element(By.XPATH, "//form[@id='loginForm']/dl[@class='bottom']/input[@id='login']").click()

    # Poll until the 'ln_uact' cookie shows up (presumably set once the login
    # has succeeded). There is deliberately no upper time limit, as in the
    # original loop, but each unsuccessful poll is followed by a short sleep
    # instead of spinning at full speed against the driver.
    while True:
        cookies = browser.get_cookies()
        # Convert selenium's list of cookie dicts into the mapping requests accepts.
        for i in cookies:
            cookie_dic[i['name']] = i['value']
        if 'ln_uact' in cookie_dic:
            break
        time.sleep(COOKIE_POLL_INTERVAL)
    print('登陆Cookies获取完毕...')
finally:
    # The browser is only needed to obtain the cookies; always release it.
    browser.quit()

print('登陆Cookies获取完毕,准备开始抓取相片...')
headers = {'Host': 'photo.renren.com',
           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}
# The 'id' cookie value is used as the user id in the album URLs below.
user_id = cookie_dic['id']
albumlist_url = "http://photo.renren.com/photo/{}/albumlist/v7#".format(user_id)
# One dict per album; populated by get_albumInfo() and get_photoURL().
albumInfo = []
def get_albumInfo():
    """Fetch the album list page and record every album in ``albumInfo``.

    The page at ``albumlist_url`` is requested with the login cookies, the
    ``'albumList': [...]`` literal embedded in it is extracted with a regular
    expression, and one dict with the keys ``albumName``, ``albumID`` and
    ``albumURL`` is appended to the module-level ``albumInfo`` list per album.

    Raises:
        IndexError: if the page contains no ``albumList`` literal.
    """
    req = requests.get(albumlist_url, headers=headers, cookies=cookie_dic)
    res = req.text

    # Raw string so that ``\s`` reaches the regex engine unchanged.
    pattern = r"'albumList':\s(.*?]),"
    match = re.search(pattern, res)
    if match is None:
        raise IndexError(
            "no 'albumList' data found in the album list page; "
            "the login cookies may be missing or invalid"
        )

    # SECURITY: the original called eval() on text downloaded from the site,
    # which would execute arbitrary expressions. ast.literal_eval accepts the
    # same literal syntax (lists, dicts, strings, numbers) without running code.
    albumlist = ast.literal_eval(match.group(1))

    for album in albumlist:
        albumInfo.append({
            'albumName': album['albumName'],
            'albumID': album['albumId'],
            'albumURL': "http://photo.renren.com/photo/{}/album-{}".format(user_id, album['albumId']),
        })
    print('相册信息获取完毕...')
def get_photoURL(albumInfo):
    """Attach the list of photo URLs to every album dict in ``albumInfo``.

    For each album, its ``albumURL`` page is requested with the login cookies,
    the JSON ``'photoList': [...]`` literal embedded in the page is parsed, and
    the ``url`` of every entry is stored under ``album['photoList']``.

    An album whose page contains no ``photoList`` literal gets an empty list
    instead of aborting the whole run.

    Args:
        albumInfo: list of album dicts carrying ``albumName`` and ``albumURL``;
            each dict is modified in place.
    """
    pattern = r"'photoList':(.*?]),"
    for album in albumInfo:
        print('开始获取相册:{} 中的照片信息...'.format(album['albumName']))
        res_photo = requests.get(album['albumURL'], headers=headers, cookies=cookie_dic).text

        match = re.search(pattern, res_photo)
        if match is None:
            # Nothing to parse on this page: record an empty album and go on.
            print('相册:{} 未找到照片列表,已跳过...'.format(album['albumName']))
            album['photoList'] = []
            continue

        album['photoList'] = [photo['url'] for photo in json.loads(match.group(1))]
        print('相册:{} 照片信息获取完毕...'.format(album['albumName']))
def download_photo(albumInfo):
    """Download every photo listed in ``albumInfo`` into ``./renren``.

    A ``renren`` directory is created in the current working directory, with
    one sub-directory per album named after its ``albumID``. Photos are saved
    as ``<index>.jpg`` in the order they appear in ``album['photoList']``.
    Each URL that could not be downloaded is printed, appended to the
    module-level ``error_list``, and finally written to
    ``renren/failures.txt`` as ``<position> <url>`` lines.

    Args:
        albumInfo: list of album dicts carrying ``albumID`` and, normally,
            ``photoList``; a missing or empty ``photoList`` means nothing is
            downloaded for that album.
    """
    dir_path = os.path.join(os.getcwd(), "renren")
    print('Creating ' + dir_path)
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)

    for album in albumInfo:
        album_id = album['albumID']
        albumpath = os.path.join(dir_path, album_id)
        if not os.path.exists(albumpath):
            print('创建-{}-相册'.format(album_id))
            os.mkdir(albumpath)

        for index, src in enumerate(album.get('photoList') or []):
            photopath = os.path.join(albumpath, '{}.jpg'.format(index))
            print(photopath, src)
            try:
                # Fetch first and open the file only afterwards, so a failed
                # request does not leave an empty .jpg behind.
                response = requests.get(src, headers=headers1, timeout=15)
                # Treat HTTP error statuses as failures rather than saving
                # the error page as an image.
                response.raise_for_status()
                with open(photopath, 'wb') as f:
                    f.write(response.content)
            except (requests.RequestException, IOError, OSError):
                # Best effort: remember the failed URL and keep going.
                print(album_id + ': ' + src + ' failed!')
                error_list.append(src)

    # Write the list of failed downloads next to the album directories.
    with open(os.path.join(dir_path, 'failures.txt'), 'w') as f:
        for index, src in enumerate(error_list):
            f.write('{} {}\n'.format(index, src))
def main():
    """Run the scraper: list the albums, collect photo URLs, download them.

    All three stages operate on the module-level ``albumInfo`` list.
    """
    get_albumInfo()
    get_photoURL(albumInfo)
    download_photo(albumInfo)
# Start the download only when the file is executed as a script.
if __name__ == '__main__':
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Credit to https://www.jianshu.com/p/2379ef53112f