# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request

from SumSpider.items import ZhihuUserItem
# start_url = "https://www.zhihu.com/api/v4/members/zhu-yun-bei-79?include=allow_message%2Cis_followed%2Cis_following%2Cis_org%2Cis_blocking%2Cemployments%2Canswer_count%2Cfollower_count%2Carticles_count%2Cgender%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"
# start_url = "https://www.zhihu.com/api/v4/members/excited-vczh/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=40&limit=20"
class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    user = "excited-vczh"
    # Profile data URL for a single user (returns JSON)
    user_url = "https://www.zhihu.com/api/v4/members/{user}?include={include}"
    user_query = "allow_message%2Cis_followed%2Cis_following%2Cis_org%2Cis_blocking%2Cemployments%2Canswer_count%2Cfollower_count%2Carticles_count%2Cgender%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"
    # Paginated list of users this user follows (followees)
    follows_url = "https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}"
    follows_query = "data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"
    # Paginated list of users who follow this user (followers)
    followers_url = "https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}"
    followers_query = "data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"
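    # A minimal sketch, not part of the original spider: the Zhihu API has been
    # known to reject requests that lack a browser-like User-Agent, so setting a
    # default header here is a reasonable precaution. Adjust or drop this block
    # if your project's settings.py already configures request headers.
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            # Assumption: any mainstream browser UA string is acceptable here.
            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/91.0.4472.124 Safari/537.36"),
        }
    }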
    # Entry point: seed requests for the starting user's profile and both follow lists
    def start_requests(self):
        yield Request(url=self.user_url.format(user=self.user, include=self.user_query), callback=self.parse_user)
        yield Request(url=self.follows_url.format(user=self.user, include=self.follows_query, offset=0, limit=20), callback=self.parse_follows)
        yield Request(url=self.followers_url.format(user=self.user, include=self.followers_query, offset=0, limit=20), callback=self.parse_followers)
    # Parse a single user's profile JSON into an item, then crawl that user's follow lists
    def parse_user(self, response):
        item = ZhihuUserItem()
        res_dict = json.loads(response.text)
        for field in item.fields:
            if field in res_dict:
                item[field] = res_dict.get(field)
        yield item
        yield Request(url=self.follows_url.format(user=res_dict.get("url_token"), include=self.follows_query, offset=0, limit=20), callback=self.parse_follows)
        yield Request(url=self.followers_url.format(user=res_dict.get("url_token"), include=self.followers_query, offset=0, limit=20), callback=self.parse_followers)
    # Followees list: extract each url_token and follow the pagination
    def parse_follows(self, response):
        res_dict = json.loads(response.text)
        if "data" in res_dict:
            for followee in res_dict.get("data"):
                if "url_token" in followee:
                    yield Request(url=self.user_url.format(user=followee.get("url_token"), include=self.user_query), callback=self.parse_user)
        if "paging" in res_dict and res_dict.get("paging").get("is_end") is False:
            next_page = res_dict.get("paging").get("next")
            yield Request(url=next_page, callback=self.parse_follows)
    # Followers list: extract each url_token and follow the pagination
    def parse_followers(self, response):
        res_dict = json.loads(response.text)
        if "data" in res_dict:
            for follower in res_dict.get("data"):
                if "url_token" in follower:
                    yield Request(url=self.user_url.format(user=follower.get("url_token"), include=self.user_query), callback=self.parse_user)
        if "paging" in res_dict and res_dict.get("paging").get("is_end") is False:
            next_page = res_dict.get("paging").get("next")
            yield Request(url=next_page, callback=self.parse_followers)
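# For reference, a hypothetical sketch of the item this spider expects; the real
# ZhihuUserItem lives in SumSpider/items.py and may define more or different fields
# (the field names below are guessed from the `include` query parameters above):
#
#     class ZhihuUserItem(scrapy.Item):
#         url_token = scrapy.Field()
#         name = scrapy.Field()
#         gender = scrapy.Field()
#         answer_count = scrapy.Field()
#         articles_count = scrapy.Field()
#         follower_count = scrapy.Field()
#
# Run the spider and dump the collected users with, for example:
#     scrapy crawl zhihu -o zhihu_users.jl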