Python Crawler: Zhihu login simulated with Selenium to get cookies + requests.Session() access + session serialization
The code is as follows:
# coding:utf-8
from selenium import webdriver
import requests
import sys
import time
from lxml import etree
import cPickle
import os
# reload(sys)
# sys.setdefaultencoding('utf-8')

class Zhihu:
    def __init__(self, homeurl):
        self.homeurl = homeurl

    def save_session(self, session):  # Save the session so it can be reused next time without logging in again
        with open('session.txt', 'wb') as f:
            cPickle.dump(session, f)
            print "Cookies have been written."

    def load_session(self):  # Load a previously saved session
        with open('session.txt', 'rb') as f:
            s = cPickle.load(f)
        return s

    def GetCookies(self):  # On first login, simulate the sign-in with Selenium and grab the cookies
        browser = webdriver.Chrome()
        browser.get("https://www.zhihu.com/signin")
        browser.find_element_by_xpath("//main//div[2]/div[1]/form/div[1]/div[2]/div[1]/input").send_keys("13060882373")
        browser.find_element_by_xpath("//main//div[2]/div[1]/form/div[2]/div/div[1]/input").send_keys("xxxxxx")
        browser.find_element_by_xpath("//main//div[2]/div[1]/form/button").click()
        time.sleep(10)
        cookies = browser.get_cookies()
        browser.quit()
        return cookies

    def get_session(self):  # Get a session
        s = requests.Session()
        if not os.path.exists('session.txt'):  # If no saved session exists yet, create one and save it to a file
            s.headers.clear()
            for cookie in self.GetCookies():
                s.cookies.set(cookie['name'], cookie['value'])
            self.save_session(s)
        else:  # If a session already exists, load and use it directly
            s = self.load_session()
        return s

    def Crawl(self):  # Start crawling
        s = self.get_session()
        html = s.get(self.homeurl).text
        html_tree = etree.HTML(html)
        items = html_tree.xpath('//main//div[1]/div[2]//div[@class="ContentItem AnswerItem"]/@data-zop')
        for item in items:
            content = eval(item)
            authorName = content['authorName']
            title = content['title']
            print authorName + " answered: " + title

zhihu = Zhihu('https://www.zhihu.com/')
zhihu.Crawl()
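The listing above is Python 2 (print statements, cPickle) and relies on the find_element_by_* helpers that were removed in Selenium 4. Below is a minimal sketch of the same idea, log in once with Selenium, copy the browser cookies into a requests.Session, and pickle the session for reuse, adapted to Python 3 and a current Selenium. The XPath selectors, the SESSION_FILE name, and the fixed 10-second wait are assumptions here; Zhihu's sign-in markup changes, so the selectors must be adjusted to the live page.

# Python 3 sketch of the same cookie-transfer and serialization steps.
# The XPath selectors below are placeholders and will need to match
# Zhihu's current sign-in page.
import os
import pickle
import time

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

SESSION_FILE = "session.pkl"  # assumed file name for the pickled session

def login_and_get_cookies(username, password):
    browser = webdriver.Chrome()
    browser.get("https://www.zhihu.com/signin")
    # Placeholder selectors: adjust to the actual login form fields.
    browser.find_element(By.XPATH, "//input[@name='username']").send_keys(username)
    browser.find_element(By.XPATH, "//input[@name='password']").send_keys(password)
    browser.find_element(By.XPATH, "//button[@type='submit']").click()
    time.sleep(10)  # wait for the login (and any captcha solved by hand) to complete
    cookies = browser.get_cookies()
    browser.quit()
    return cookies

def get_session(username, password):
    if os.path.exists(SESSION_FILE):
        with open(SESSION_FILE, "rb") as f:
            return pickle.load(f)  # reuse the previously saved session
    s = requests.Session()
    for c in login_and_get_cookies(username, password):
        s.cookies.set(c["name"], c["value"])  # copy browser cookies into requests
    with open(SESSION_FILE, "wb") as f:
        pickle.dump(s, f)
    print("Cookies have been written.")
    return s

One further note on the Crawl step: the data-zop attribute holds a JSON object, so json.loads(item) is a safer way to parse it than eval(item).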