Crawlers: Scraping Zhihu User Information
Unlike the two crawlers I wrote about earlier, this one is entirely my own work; it uses the Python libraries requests, redis, and lxml (plus sqlite3 for storage).
There are three files in total: config.ini holds the username and cookie configuration, zhihusp.py scrapes userids, and get-info.py scrapes the rest of each user's information.
All three files are listed below. The two Python files are commented in fair detail, so they should be easy to follow.
config.ini
[info]
phone_num = 15*********
password = ************

[cookies]
q_c1 = 5fd5e96aa1cc40f587e2fcaa621030ee|1448986627000|1448986627000
cap_id = Zjk3N2I3MjU1ZmIyNGJkNWJIDOxYmE3ZDEzN2QyOGE=|1449289675|612bbfbnjd2e3bca76d397a2c67c921fe7c852b
_za = b7e8ab32-03b3-473b-87e6-68fe9f9e7933
__utmt = 1
__utma = 51854390.1168696635.1449128833.1449239113.1449289659.5
__utmb = 51854390.6.10.1449289659
__utmc = 51854390
__utmz = 51854390.1449223233.4.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/excited-vczh/followers
__utmv = 51854390.100-2|2=re=1^3=entry_date=20151202=1
z_c0 = QUJDTXpzbTNGd2tYQUFBdffabXowaVZZdHBZbnJIS3FhYjZBQnRTWllWQlZ1T1kyc1dnPT0=|1449289708|7020f5e7c6c95b043e48c02afffb3a9c40035a77
unlock_ticket = QUJDTXpzbTNGd2tYQUFBQVlRSlZUZlJ1WWxaUDlzRGpZTVocGdnUl8xZkVNbDNBPT0=|1554289708|d906b57006b0cd84c58c4f6d6e1eb16e17e64
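For reference, here is how the two Python scripts below consume this file: ConfigParser reads the [cookies] section as (name, value) pairs, and dict() turns them into the cookie jar passed to every requests call. A minimal standalone sketch (Python 2, same calls as in zhihusp.py):

# -*- coding: utf-8 -*-
# Minimal sketch: loading config.ini the same way zhihusp.py does
import ConfigParser

cf = ConfigParser.ConfigParser()
cf.read('config.ini')

phone_num = cf.get('info', 'phone_num')   # login credentials
cookies = dict(cf.items('cookies'))       # {'q_c1': '...', 'z_c0': '...', ...}

# The dict can be passed straight to requests, e.g.:
#   requests.get('http://www.zhihu.com/', cookies=cookies)
print cookies.keys()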
zhihusp.py: mainly used to scrape follower userids from the followers list
# -*- coding: utf-8 -*-
'''
A web crawler for Zhihu user information.
'''
import requests, json, re, redis, sqlite3
import ConfigParser
from lxml import etree
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class ZhihuSpider(object):
    """Crawl Zhihu follower/followee lists and queue userids in redis."""
    r = redis.Redis(host='127.0.0.1', port=6379, db=1)
    cf = ConfigParser.ConfigParser()
    cf.read('config.ini')
    cookies = cf.items('cookies')
    cookies = dict(cookies)
    session = requests.session()
    conn = sqlite3.connect('zhihuuser.db')
    conn.text_factory = str
    cur = conn.cursor()

    # Create the session; if username/password login fails, fall back
    # to logging in with the cookies from config.ini
    def create_session(self):
        from pprint import pprint
        pprint(self.cookies)
        phone_num = self.cf.get('info', 'phone_num')
        password = self.cf.get('info', 'password')
        login_data = {'phone_num': phone_num, 'password': password}
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36',
            'Host': 'www.zhihu.com',
            'Referer': 'http://www.zhihu.com/'
        }
        r = self.session.post('http://www.zhihu.com/login/phone_num',
                              data=login_data,
                              headers=header)
        if r.json()['r'] == 1:
            print 'Login Failed, reason is:',
            for m in r.json()['data']:
                print r.json()['data'][m]
            print 'So we use cookies to login in...'
            has_cookies = False
            for key in self.cookies:
                if key != '__name__' and self.cookies[key] != '':
                    has_cookies = True
                    break
            if has_cookies is False:
                raise ValueError('Please fill in the cookies section of config.ini.')
            else:
                # Log in with cookies instead
                r = self.session.get('http://www.zhihu.com/login/phone_num',
                                     cookies=self.cookies)

        with open('login.html', 'w') as fp:
            fp.write(r.content)

    # Request the pages listing a user's followers and followees,
    # and queue every new userid found there in redis
    def follow(self, userid):
        print "NOW Follow:", userid
        self.r.set(userid, False)
        follower_url = "http://www.zhihu.com/people/" + userid + "/followers"
        follower, followee, user_urls = self.getinfo(userid)
        for u_url in user_urls:
            # use a separate name so the original userid is not clobbered
            fid = u_url.split('/')[-1]
            if self.not_in(fid):
                self.r.set(fid, True)
        if follower > 20:
            self.doprofiles(follower, follower_url)

        # Extract the userids on the first page of followees
        followee_url = "http://www.zhihu.com/people/" + userid + "/followees"
        response = self.session.get(followee_url, cookies=self.cookies).content
        page = etree.HTML(response)
        user_urls = page.xpath('//h2/a[@class="zg-link"]/@href')

        for u_url in user_urls:
            fid = u_url.split('/')[-1]
            if self.not_in(fid):
                self.r.set(fid, True)
        if followee > 20:
            self.doprofiles(followee, followee_url)

    # Dynamically fetch the entries hidden behind the "more" button
    def doprofiles(self, attention, url):
        thisheader = {
            'Host': 'www.zhihu.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36',
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Referer': url,
            # Content-Length is set automatically by requests
            'Cookie': 'fill in your own'
        }
        hash_id = 'fill in your own'
        xsrf = 'fill in your own'
        # Work out the page count, then pull the remaining followers
        # 20 at a time
        pages = attention / 20 + 1
        # if pages > 600:
        #     pages = 600
        for x in xrange(1, pages):
            offset = x * 20
            params = json.dumps({"offset": offset,
                                 "order_by": "created",
                                 "hash_id": hash_id})
            payload = {"method": "next", "params": params, "_xsrf": xsrf}
            content = self.session.post("http://www.zhihu.com/node/ProfileFollowersListV2",
                                        headers=thisheader,
                                        data=payload).content
            load = json.loads(content)
            lists = load['msg']
            for item in lists:
                try:
                    userpeople = re.search(r'people/[\w+\d+-]+', item)
                    if userpeople is not None:
                        people = userpeople.group()
                        userid = people.split('/')[-1]
                        print "FOUND:", userid
                        if self.not_in(userid):
                            self.r.set(userid, True)
                except AttributeError:
                    print "ERROR"
        # self.num += 1
        self.gofollow()

    # Keep following: process every userid still marked 'True' in redis
    def gofollow(self):
        for key in self.r.keys():
            if self.r.get(key) == 'True':
                self.follow(key)

    # Check whether a userid is not yet in redis
    def not_in(self, userid):
        return not self.r.exists(userid)

    # Fetch a user's followers page and return the follower/followee
    # counts plus the profile links on the first page
    def getinfo(self, userid):
        follower_url = "http://www.zhihu.com/people/" + userid + "/followers"
        response = self.session.get(follower_url, cookies=self.cookies).content
        page = etree.HTML(response)
        user_urls = page.xpath('//h2/a[@class="zg-link"]/@href')
        followee = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[1]/strong/text()')[0])
        follower = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[2]/strong/text()')[0])
        return follower, followee, user_urls

if __name__ == '__main__':
    zhihu = ZhihuSpider()
    # Create the table
    zhihu.cur.execute('''create table if not exists userstb
                         (userid text primary key,
                          username text, gender text, followee integer,
                          follower integer, location text,
                          business text, employment text,
                          position text, education text, college text,
                          question_num integer, answer_num text)''')
    zhihu.conn.commit()
    zhihu.create_session()

    # A few high-profile Zhihu users to seed the crawl
    first_users = ['excited-vczh', 'warfalcon', 'gejinyuban']
    for user in first_users:
        if zhihu.r.exists(user):
            continue
        else:
            zhihu.follow(user)
    # Pick up userids in redis that have not been followed yet
    for key in zhihu.r.keys():
        if zhihu.r.get(key) == 'True':
            zhihu.follow(key)
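One thing worth spelling out is how redis db1 is used as the crawl frontier: a key is a userid, the value 'True' means discovered but not yet followed, and 'False' means already followed. With the legacy redis-py assumed here, set(key, True) stores the string 'True', which is why the code compares against the string rather than a boolean. A minimal sketch of that bookkeeping pattern:

# -*- coding: utf-8 -*-
# Sketch of the redis bookkeeping used by ZhihuSpider (db=1):
#   'True'  -> userid discovered, not yet followed
#   'False' -> userid already followed
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=1)

def discover(userid):
    # enqueue a userid only the first time it is seen
    if not r.exists(userid):
        r.set(userid, True)    # legacy redis-py stores this as the string 'True'

def mark_followed(userid):
    r.set(userid, False)

discover('excited-vczh')
for key in r.keys():
    if r.get(key) == 'True':   # get() returns strings, not booleans
        print 'still to follow:', key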
get-info.py: visits each userid's profile page and extracts the information
# -*- coding: utf-8 -*-
'''
A web crawler for Zhihu user information.
'''
import requests, json, re, redis, sqlite3
import ConfigParser
from lxml import etree
from time import ctime
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class GetInfo(object):
    """Visit each queued userid's profile page and store its details."""

    r1 = redis.Redis(host='127.0.0.1', port=6379, db=1)   # crawl frontier
    r2 = redis.Redis(host='127.0.0.1', port=6379, db=2)   # already fetched
    cf = ConfigParser.ConfigParser()
    cf.read('config.ini')
    cookies = cf.items('cookies')
    cookies = dict(cookies)
    session = requests.session()
    conn = sqlite3.connect('zhihuuser.db')
    cur = conn.cursor()
    itemlist = []
    useridlist = []
    flag = 0

    # Request the user's profile page, extract the info, and store it
    # in the database
    def getinfo(self, userid):
        url = "http://www.zhihu.com/people/" + userid
        print "GET:%s---%s" % (userid, ctime())

        # Exception handling is essential here
        try:
            response = self.session.get(url, cookies=self.cookies).content
            page = etree.HTML(response)
            username = page.xpath('//div[@class="title-section ellipsis"]/span[@class="name"]/text()')[0]
            location = page.xpath('//div[@data-name="location"]/span/span[@class="location item"]/@title')
            business = page.xpath('//div[@data-name="location"]/span/span[@class="business item"]/@title')
            gendertit = page.xpath('//div[@data-name="location"]/span/span[@class="item gender"]/i/@class')
            # Gender cannot be read out directly, so infer it from the
            # icon's CSS class instead
            if len(gendertit) == 0:
                gender = 'notsure'
            elif re.search(r'female', gendertit[0]):
                gender = u'女'
            else:
                gender = u'男'
            employment = page.xpath('//div[@data-name="employment"]/span/span[@class="employment item"]/@title')
            position = page.xpath('//div[@data-name="employment"]/span/span[@class="position item"]/@title')
            education = page.xpath('//div[@data-name="education"]/span/span[@class="education item"]/@title')
            college = page.xpath('//div[@data-name="education"]/span/span[@class="education-extra item"]/@title')
            followee = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[1]/strong/text()')[0])
            follower = int(page.xpath('//div[@class="zm-profile-side-following zg-clear"]/a[2]/strong/text()')[0])
            question_num = int(page.xpath('//div[@class="profile-navbar clearfix"]/a[2]/span/text()')[0])
            answer_num = int(page.xpath('//div[@class="profile-navbar clearfix"]/a[3]/span/text()')[0])

            # Some fields may be left blank by the user, so fall back
            # to None when the xpath result is empty
            location = location[0] if location else None
            business = business[0] if business else None
            employment = employment[0] if employment else None
            position = position[0] if position else None
            education = education[0] if education else None
            college = college[0] if college else None

            # Store in the database and commit
            item = (userid, username, gender, followee, follower,
                    location, business, employment, position, education,
                    college, question_num, answer_num)
            print userid, username
            has_in = self.cur.execute("insert into userstb values (?,?,?,?,?,?,?,?,?,?,?,?,?)", item)
            self.conn.commit()
            if has_in:
                print u"stored successfully"
                self.r2.set(userid, True)
            else:
                print u"store failed"
        except requests.exceptions.RequestException:
            # Network hiccup: leave the userid unmarked so it is retried
            # on the next pass (returning avoids unbounded recursion)
            print u"connection error"
        except Exception:
            # Parsing failed (e.g. an anomalous profile page): mark it
            # as done so we do not get stuck on it
            self.r2.set(userid, True)

    # Main loop: pull userids from redis db1 that have not been
    # fetched yet (i.e. that have no entry in db2)
    def main(self):
        while True:
            for key in self.r1.keys():
                if not self.r2.exists(key):
                    self.getinfo(key)

if __name__ == '__main__':
    begin = GetInfo()
    begin.main()
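Once both scripts have run for a while, the results can be inspected straight from zhihuuser.db. For example, a quick (hypothetical) check against the userstb schema created in zhihusp.py:

# -*- coding: utf-8 -*-
# Quick look at the scraped data (userstb schema from zhihusp.py)
import sqlite3

conn = sqlite3.connect('zhihuuser.db')
cur = conn.cursor()

cur.execute("select count(*) from userstb")
print "users scraped:", cur.fetchone()[0]

# the five most-followed users collected so far
cur.execute("""select userid, username, follower
               from userstb order by follower desc limit 5""")
for row in cur.fetchall():
    print row

conn.close()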
GG