Crawlers: scraping Zhihu user information

This crawler is different from the two written up earlier: I wrote this one myself, using the Python libraries requests, redis, and lxml.

There are three files in total: config.ini, the username and cookie configuration file; zhihusp.py, which crawls user IDs; and get-info.py, which crawls the rest of each user's information.

All three files are pasted below. The two Python files are fairly well commented, so they should be easy to follow.

config.ini
[info]
phone_num = 15*********
password = ************

[cookies]
q_c1 = 5fd5e96aa1cc40f587e2fcaa621030ee|1448986627000|1448986627000
cap_id = Zjk3N2I3MjU1ZmIyNGJkNWJIDOxYmE3ZDEzN2QyOGE=|1449289675|612bbfbnjd2e3bca76d397a2c67c921fe7c852b
_za = b7e8ab32-03b3-473b-87e6-68fe9f9e7933
__utmt = 1
__utma = 51854390.1168696635.1449128833.1449239113.1449289659.5
__utmb = 51854390.6.10.1449289659
__utmc = 51854390
__utmz = 51854390.1449223233.4.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/excited-vczh/followers
__utmv = 51854390.100-2|2=re=1^3=entry_date=20151202=1
z_c0 = QUJDTXpzbTNGd2tYQUFBdffabXowaVZZdHBZbnJIS3FhYjZBQnRTWllWQlZ1T1kyc1dnPT0=|1449289708|7020f5e7c6c95b043e48c02afffb3a9c40035a77
unlock_ticket = QUJDTXpzbTNGd2tYQUFBQVlRSlZUZlJ1WWxaUDlzRGpZTVocGdnUl8xZkVNbDNBPT0=|1554289708|d906b57006b0cd84c58c4f6d6e1eb16e17e64
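
For reference, this is roughly how both scripts below turn the [cookies] section into a dict that requests can send with every request; a minimal sketch using Python 2's ConfigParser, mirroring what zhihusp.py and get-info.py already do (the sanity-check request at the end is only an illustration and not part of the original code):

# -*- coding: utf-8 -*-
import ConfigParser
import requests

cf = ConfigParser.ConfigParser()
cf.read('config.ini')

# cf.items('cookies') returns (name, value) pairs; dict() turns them
# into the cookie mapping that requests attaches to each request.
cookies = dict(cf.items('cookies'))

# Sanity check: fetch the homepage with the cookies attached.
# If the z_c0 cookie is still valid, the logged-in page comes back.
resp = requests.get('http://www.zhihu.com/', cookies=cookies,
                    headers={'User-Agent': 'Mozilla/5.0'})
print resp.status_code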

zhihusp.py: mainly used to grab follower IDs from users' followers lists.

# -*- coding: utf-8 -*-
'''
Web crawler: scraping Zhihu user information
'''
import requests, json, re, redis, sqlite3
import ConfigParser
from lxml import etree
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class ZhihuSpider(object):
    """docstring for ZhihuSpider"""
    r = redis.Redis(host='127.0.0.1', port=6379, db=1)
    cf = ConfigParser.ConfigParser()
    cf.read('config.ini')
    cookies = cf.items('cookies')
    cookies = dict(cookies)
    session = requests.session()
    conn = sqlite3.connect('zhihuuser.db')
    conn.text_factory = str
    cur = conn.cursor()

    # Create the session; if username/password login does not work,
    # fall back to logging in with the cookies from config.ini
    def create_session(self):

        from pprint import pprint
        pprint(self.cookies)
        phone_num = self.cf.get('info', 'phone_num')
        password = self.cf.get('info', 'password')
        login_data = {'phone_num': phone_num, 'password': password}
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36',
            'Host': 'www.zhihu.com',
            'Referer': 'http://www.zhihu.com/'
        }
        r = self.session.post('http://www.zhihu.com/login/phone_num',
                              data=login_data,
                              headers=header)
        if r.json()['r'] == 1:
            print 'Login Failed, reason is:',
            for m in r.json()['data']:
                print r.json()['data'][m]
            print 'So we use cookies to login in...'
            has_cookies = False
            for key in self.cookies:
                if key != '__name__' and self.cookies[key] != '':
                    has_cookies = True
                    break
            if has_cookies is False:
                raise ValueError('Please fill in the cookies section of config.ini.')
            else:
                # Log in with the cookies instead
                r = self.session.get('http://www.zhihu.com/login/phone_num',
                                     cookies=self.cookies)

        with open('login.html', 'w') as fp:
            fp.write(r.content)

    # Request a user's followers/followees pages
    def follow(self, userid):
        print "NOW Follow:", userid
        self.r.set(userid, False)
        follower_url = "http://www.zhihu.com/people/" + userid + "/followers"
        follower, followee, user_urls = self.getinfo(userid)
        # print user_urls
        for u_url in user_urls:
            uid = u_url.split('/')[-1]
            # print "FFFFFFFLLLLLLLL@*******", uid
            if self.not_in(uid):
                self.r.set(uid, True)
        # print type(follower), follower
        if follower > 20:
            self.doprofiles(follower, follower_url)

        # Extract the user ids from the first page of followees
        followee_url = "http://www.zhihu.com/people/" + userid + "/followees"
        response = self.session.get(followee_url, cookies=self.cookies).content
        page = etree.HTML(response)
        user_urls = page.xpath('//h2/a[@class="zg-link"]/@href')

        for u_url in user_urls:
            uid = u_url.split('/')[-1]
            # print "WWWWWWWW*****", uid
            if self.not_in(uid):
                self.r.set(uid, True)
        if followee > 20:
            self.doprofiles(followee, followee_url)

    # Dynamically fetch the extra pages behind the "more" button
    def doprofiles(self, attention, url):
        thisheader = {
            'Host': 'www.zhihu.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36',
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Referer': url,
            'Content-Length': '171',
            'Cookie': 'fill in your own'
        }
        hash_id = 'fill in your own'
        xsrf = 'fill in your own'
        # Work out the number of pages, then fetch the followers behind "more"
        pages = attention / 20 + 1
        # if pages > 600:
        #     pages = 600
        for x in xrange(1, pages):
            offset = x * 20
            params = json.dumps({"offset": offset,
                                 "order_by": "created",
                                 "hash_id": hash_id})
            payload = {"method": "next", "params": params, "_xsrf": xsrf}
            content = self.session.post("http://www.zhihu.com/node/ProfileFollowersListV2",
                                        headers=thisheader,
                                        data=payload).content
            load = json.loads(content)
            # print type(load)
            lists = load['msg']
            for item in lists:
                try:
                    userpeople = re.search(r'people/[\w+\d+-]+', item)
                    # print userpeople
                    if userpeople is not None:
                        people = userpeople.group()
                        userid = people.split('/')[-1]
                        print "PPPPPPPPPPPPPP-------", userid
                        if self.not_in(userid):
                            self.r.set(userid, True)
                except AttributeError:
                    print "ERROR"
                # self.num += 1
        self.gofollow()


    # Keep following
    def gofollow(self):
        for key in self.r.keys():
            if self.r.get(key) == 'True':
                self.follow(key)

    # Check whether the user id already exists in redis
    def not_in(self, userid):
        if self.r.exists(userid):
            return False
        else:
            return True

    def getinfo(self, userid):
        follower_url = "http://www.zhihu.com/people/" + userid + "/followers"
        response = self.session.get(follower_url, cookies=self.cookies).content
        page = etree.HTML(response)
        user_urls = page.xpath('//h2/a[@class="zg-link"]/@href')
        # Read the followee/follower counts from the profile sidebar
        followee = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[1]/strong/text()')[0])
        follower = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[2]/strong/text()')[0])
        return follower, followee, user_urls

if __name__ == '__main__':
    zhihu = ZhihuSpider()
    # Create the table
    zhihu.cur.execute('''create table if not exists userstb
        (userid text primary key,
        username text, gender text, followee integer,
        follower integer, location text,
        business text, employment text,
        position text, education text, college text,
        question_num integer, answer_num text)''')
    zhihu.conn.commit()
    zhihu.create_session()

    # A few well-known Zhihu users to seed the crawl
    first_users = ['excited-vczh', 'warfalcon', 'gejinyuban']
    for user in first_users:
        if zhihu.r.exists(user):
            continue
        else:
            zhihu.follow(user)
    # Pick up user ids from redis that have not been followed yet
    for key in zhihu.r.keys():
        if zhihu.r.exists(key):
            if zhihu.r.get(key) == 'True':
                zhihu.follow(key)
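
While zhihusp.py runs, the discovered user IDs live in Redis db 1: a value of 'True' means the id is still queued, 'False' means its follower pages have already been fetched. Here is a small, hypothetical progress check that assumes that same layout (it is not part of the original script):

# -*- coding: utf-8 -*-
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=1)

queued, visited = 0, 0
for key in r.keys():
    # zhihusp.py stores 'True' for ids still waiting to be followed
    # and 'False' for ids whose follower pages have been fetched.
    if r.get(key) == 'True':
        queued += 1
    else:
        visited += 1

print "queued: %d  visited: %d  total: %d" % (queued, visited, queued + visited)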

get-info.py: visits each user ID's profile page and extracts the details.

# -*- coding: utf-8 -*-
'''
Web crawler: scraping Zhihu user information
'''
import requests, json, re, redis, sqlite3
import ConfigParser
from lxml import etree
from time import ctime
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class GetInfo(object):
    """docstring for GetInfo"""

    r1 = redis.Redis(host='127.0.0.1', port=6379, db=1)
    r2 = redis.Redis(host='127.0.0.1', port=6379, db=2)
    cf = ConfigParser.ConfigParser()
    cf.read('config.ini')
    cookies = cf.items('cookies')
    cookies = dict(cookies)
    session = requests.session()
    conn = sqlite3.connect('zhihuuser.db')
    cur = conn.cursor()
    itemlist = []
    useridlist = []
    flag = 0

    # Request the user's profile page, extract the info and save it to the database
    def getinfo(self, userid):
        url = "http://www.zhihu.com/people/" + userid
        print "GET:%s---%s" % (userid, ctime())

        # Exception handling, essential!!
        try:
            response = self.session.get(url, cookies=self.cookies).content
            page = etree.HTML(response)
            username = page.xpath('//div[@class="title-section ellipsis"]/span[@class="name"]/text()')[0]
            location = page.xpath('//div[@data-name="location"]/span/span[@class="location item"]/@title')
            business = page.xpath('//div[@data-name="location"]/span/span[@class="business item"]/@title')
            gendertit = page.xpath('//div[@data-name="location"]/span/span[@class="item gender"]/i/@class')
            # Gender cannot be read out directly, so infer it from the icon's class
            if len(gendertit) == 0:
                gender = 'notsure'
            elif re.search(r'female', gendertit[0]):
                gender = u'female'
            else:
                gender = u'male'
            employment = page.xpath('//div[@data-name="employment"]/span/span[@class="employment item"]/@title')
            position = page.xpath('//div[@data-name="employment"]/span/span[@class="position item"]/@title')
            education = page.xpath('//div[@data-name="education"]/span/span[@class="education item"]/@title')
            college = page.xpath('//div[@data-name="education"]/span/span[@class="education-extra item"]/@title')
            followee = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[1]/strong/text()')[0])
            follower = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[2]/strong/text()')[0])
            question_num = int(page.xpath('//div[@class="profile-navbar clearfix"]/a[2]/span/text()')[0])
            answer_num = int(page.xpath('//div[@class="profile-navbar clearfix"]/a[3]/span/text()')[0])

            # Some fields are left blank by the user, so check for empty results
            if len(location) == 0:
                location = None
            else:
                location = location[0]
            if len(business) == 0:
                business = None
            else:
                business = business[0]
            if len(employment) == 0:
                employment = None
            else:
                employment = employment[0]
            if len(position) == 0:
                position = None
            else:
                position = position[0]
            if len(education) == 0:
                education = None
            else:
                education = education[0]
            if len(college) == 0:
                college = None
            else:
                college = college[0]

            # Save to the database and commit
            item = (userid, username, gender, followee, follower,
                    location, business, employment, position, education,
                    college, question_num, answer_num)
            print userid, username
            has_in = self.cur.execute("insert into userstb values(?,?,?,?,?,?,?,?,?,?,?,?,?)", item)
            self.conn.commit()
            if has_in:
                print u"Saved successfully"
                self.r2.set(userid, True)
            else:
                print u"Save failed"
        except requests.exceptions.RequestException:
            print u'Connection error'
            self.main()
        except Exception:
            self.r2.set(userid, True)
            self.main()

    # Main loop: pull ids from redis that have not been queried yet
    def main(self):
        while True:
            for key in self.r1.keys():
                if self.r2.exists(key):
                    continue
                else:
                    self.getinfo(key)

if __name__ == '__main__':
    begin = GetInfo()
    begin.main()
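
Once get-info.py has run for a while, the profiles sit in the userstb table of zhihuuser.db, using the schema created in zhihusp.py. A quick sketch for spot-checking what has been stored (the queries are only an example, not part of the crawler):

# -*- coding: utf-8 -*-
import sqlite3

conn = sqlite3.connect('zhihuuser.db')
cur = conn.cursor()

# How many users have been stored so far
cur.execute("select count(*) from userstb")
print "users stored:", cur.fetchone()[0]

# A few sample rows: user id, name and follower count
cur.execute("select userid, username, follower from userstb "
            "order by follower desc limit 5")
for row in cur.fetchall():
    print row

conn.close()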

GG

 

posted @ 2015-12-08 20:56  phil_chow