Last active
December 28, 2018 07:55
-
-
Save lefttree/5c92a2480376650567ad640016efb188 to your computer and use it in GitHub Desktop.
Revisions
-
lefttree revised this gist
Dec 28, 2018 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -111,7 +111,7 @@ def download_photo(albumInfo): #记录下载失败的相片信息,并记录在人人网文件夹下的下载失败.txt文件里 print(album['albumID']+': '+src+' failed!') error_list.append(src) with open(dir_path + '/failures.txt', 'w') as f: for i in error_list: f.write(str(error_list.index(i)) + ' ' + i + '\n') def main(): -
lefttree revised this gist
Dec 28, 2018 . 1 changed file with 0 additions and 7 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,12 +1,5 @@ #!/usr/bin/env python # encoding: utf-8 import re import os -
lefttree created this gist
Dec 28, 2018 .There are no files selected for viewing
#!/usr/bin/env python
# encoding: utf-8
"""
Log in to renren.com through a Selenium-driven Chrome, then download the
photos listed on each of the logged-in user's album pages with requests.

Python: 3.6
Switched to Python 3 and it feels good -- no more worrying about encoding
issues.
Author: ISeeMoon
Software: PyCharm
File: renren.py
Time: 2018/1/20 13:07
"""
import re
import os
import json
import time
import requests
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Ask for the account name and password used to log in and obtain cookies.
# Python 3's input() returns the typed text verbatim, so the prompts no longer
# ask the user to add quotes (a Python 2 leftover): the quote characters would
# otherwise be typed into the login form as part of the credentials.
login_account = input("请输入你的用户名:")
password = input("请输入你的密码:")

# URLs of photos whose download failed; filled in by download_photo().
error_list = []
# Location of the chromedriver binary. The CHROMEDRIVER_PATH environment
# variable overrides the original hard-coded default.
chromepath = os.environ.get("CHROMEDRIVER_PATH",
                            r"/Users/bylixiang/bin/chromedriver")
# Cookie name -> value, in the plain-dict form requests accepts.
cookie_dic = {}

chrome_opt = webdriver.ChromeOptions()
# NOTE(review): presumably this preference stops Chrome from loading images
# during login -- confirm against the Chrome preference documentation.
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_opt.add_experimental_option("prefs", prefs)
browser = webdriver.Chrome(chromepath, chrome_options=chrome_opt)
try:
    browser.get('http://www.renren.com/')
    wait = WebDriverWait(browser, 3)
    login = wait.until(
        EC.presence_of_element_located((By.XPATH, "//input[@name='email']"))
    )
    login.send_keys(login_account)
    pwd = wait.until(
        EC.presence_of_element_located((By.XPATH, "//input[@id='password']"))
    )
    pwd.send_keys(password)
    # Tick the "autoLogin" checkbox, then press the login button.
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
    # Selenium releases no longer provide.
    browser.find_element(By.XPATH, "//form[@id='loginForm']/dl[@class='savepassword clearfix']/dt/label[@class='labelCheckbox']/input[@id='autoLogin']").click()
    browser.find_element(By.XPATH, "//form[@id='loginForm']/dl[@class='bottom']/input[@id='login']").click()

    # Poll the browser until the 'ln_uact' cookie shows up, copying every
    # cookie from Selenium's list-of-dicts format into the dict requests uses.
    while True:
        cookies = browser.get_cookies()
        print('登陆Cookies获取完毕...')
        for cookie in cookies:
            cookie_dic[cookie['name']] = cookie['value']
        if 'ln_uact' in cookie_dic:
            break
        # Short pause so the poll does not spin at full speed.
        time.sleep(0.5)
finally:
    # The browser is not used after the cookies are copied; quit it so the
    # Chrome process is not left running.
    browser.quit()
print('登陆Cookies获取完毕,准备开始抓取相片...')

# headers: sent with the album/photo list page requests.
# headers1: sent with the image downloads.
headers = {'Host': 'photo.renren.com',
           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}
user_id = cookie_dic['id']
albumlist_url = "http://photo.renren.com/photo/{}/albumlist/v7#".format(user_id)
# One dict per album: albumName, albumID, albumURL and (later) photoList.
albumInfo = []


def _first_match(pattern, text, what):
    """Return group 1 of the first match of *pattern* in *text*.

    Raises IndexError -- the exception the previous ``re.findall(...)[0]``
    produced -- but with a message naming what was missing (*what*).
    """
    match = re.search(pattern, text)
    if match is None:
        raise IndexError('{} not found in the downloaded page'.format(what))
    return match.group(1)


def get_albumInfo():
    """Fetch the album list page and append one entry per album to albumInfo.

    Each entry is a dict with the keys 'albumName', 'albumID' and 'albumURL'.

    Raises:
        IndexError: the page does not contain an 'albumList' array.
        requests.RequestException: the page request failed or timed out.
    """
    res = requests.get(albumlist_url, headers=headers, cookies=cookie_dic,
                       timeout=15).text
    # Raw string: '\s' in a normal string literal is an invalid escape.
    pattern = r"'albumList':\s(.*?]),"
    # SECURITY NOTE(review): eval() runs text downloaded from the network as
    # Python code. Kept because the exact payload format cannot be confirmed
    # here; switch to json.loads or ast.literal_eval once it is verified.
    albumlist = eval(_first_match(pattern, res, 'albumList'))
    for album in albumlist:
        albumInfo.append({
            'albumName': album['albumName'],
            'albumID': album['albumId'],
            'albumURL': "http://photo.renren.com/photo/{}/album-{}".format(user_id, album['albumId']),
        })
    print('相册信息获取完毕...')


def get_photoURL(albumInfo):
    """Add a 'photoList' (list of photo URLs) to every album in *albumInfo*.

    Args:
        albumInfo: list of album dicts as built by get_albumInfo(); each dict
            is updated in place.

    Raises:
        IndexError: an album page does not contain a 'photoList' array.
        requests.RequestException: an album page request failed or timed out.
    """
    pattern = "'photoList':(.*?]),"
    for album in albumInfo:
        print('开始获取相册:{} 中的照片信息...'.format(album['albumName']))
        res_photo = requests.get(album['albumURL'], headers=headers,
                                 cookies=cookie_dic, timeout=15).text
        photojson = json.loads(_first_match(pattern, res_photo, 'photoList'))
        album['photoList'] = [photo['url'] for photo in photojson]
        print('相册:{} 照片信息获取完毕...'.format(album['albumName']))


def download_photo(albumInfo):
    """Download every photo of every album into ./renren/<albumID>/<n>.jpg.

    URLs that fail to download are appended to the module-level error_list,
    and the numbered list of failures is rewritten to ./renren/failures.txt
    after each failure.

    Args:
        albumInfo: list of album dicts carrying 'albumID' and 'photoList'.
    """
    dir_path = os.path.join(os.getcwd(), "renren")
    print('Creating ' + dir_path)
    os.makedirs(dir_path, exist_ok=True)

    for album in albumInfo:
        albumpath = os.path.join(dir_path, str(album['albumID']))
        if not os.path.exists(albumpath):
            print('创建-{}-相册'.format(album['albumID']))
            os.mkdir(albumpath)

    # Kept inside the download directory rather than at a machine-specific
    # absolute path.
    failure_log = os.path.join(dir_path, 'failures.txt')

    for album in albumInfo:
        # Albums with an empty (or missing) photo list are skipped naturally.
        for index, src in enumerate(album.get('photoList', [])):
            photopath = '{}/{}/{}.jpg'.format(dir_path, album['albumID'], index)
            print(photopath, src)
            try:
                # Fetch first and open the file only on success, so a failed
                # download does not leave an empty .jpg behind.
                response = requests.get(src, headers=headers1, timeout=15)
                response.raise_for_status()
                with open(photopath, 'wb') as photo_file:
                    photo_file.write(response.content)
            except (requests.RequestException, OSError):
                # Record the failed photo and refresh the failure log.
                print(str(album['albumID']) + ': ' + src + ' failed!')
                error_list.append(src)
                with open(failure_log, 'w', encoding='utf-8') as log_file:
                    for number, failed_src in enumerate(error_list):
                        log_file.write(str(number) + ' ' + failed_src + '\n')


def main():
    """Collect album info, resolve photo URLs, then download the photos."""
    get_albumInfo()
    get_photoURL(albumInfo)
    download_photo(albumInfo)


if __name__ == '__main__':
    main()