Scrapy (Part 1): Crawling All Zhihu User Information
1. Installing Scrapy (Windows)
- Download the Twisted wheel from https://www.lfd.uci.edu/~gohlke/pythonlibs/ and install it manually.
- pip install pywin32
- pip install scrapy
Run the following commands in CMD to verify that the installation is complete:
- scrapy startproject myjob
- cd myjob
- scrapy genspider baidu www.baidu.com
- scrapy crawl baidu
If no errors are reported, the installation succeeded.
2. Starting a project
scrapy startproject zhihu
Take a look at the generated directory and open it in PyCharm.
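For reference, the layout generated by scrapy startproject typically looks like this (middlewares.py appears in Scrapy 1.2+; your version may differ slightly):

zhihu/
    scrapy.cfg
    zhihu/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py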
settings.py
# Defaults to True; obeying robots.txt may block the crawl, so change it to False for now
ROBOTSTXT_OBEY = False
# Enable the default request headers and add a User-Agent
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class UsersItem(Item):
    answer_count = Field()
    articles_count = Field()
    avatar_url = Field()
    avatar_url_template = Field()
    badge = Field()
    follower_count = Field()
    gender = Field()
    headline = Field()
    id = Field()
    is_advertiser = Field()
    is_followed = Field()
    is_following = Field()
    is_org = Field()
    name = Field()
    type = Field()
    url = Field()
    url_token = Field()
    user_type = Field()
zhihu.py
# -*- coding: utf-8 -*-
from scrapy import Spider, Request
import json
import re
from ..items import UsersItem

# Crawled user names are appended to a local text file as a sanity check
f = open(r'F:\zhihu.txt', 'w', encoding='utf-8')


class UserinfoSpider(Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    start_user = 'kaifulee'
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
    follow_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    follow_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        # Seed requests: the starting user's profile page and followee list
        # url = 'https://www.zhihu.com/api/v4/members/kaifulee/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20'
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query),
                      callback=self.user_parse)
        yield Request(self.follow_url.format(user=self.start_user, include=self.follow_query, offset=0, limit=20),
                      callback=self.follow_parse)

    def user_parse(self, response):
        result = json.loads(response.text)
        # Collect the user's profile info
        item = UsersItem()
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        yield Request(self.follow_url.format(user=result.get('url_token'), include=self.follow_query, limit=20, offset=0),
                      callback=self.follow_parse)

    def follow_parse(self, response):
        results = json.loads(response.text)
        # Pull each followee's url_token and feed it back into the profile url to keep iterating
        if 'data' in results.keys():
            # As a sanity check, save the user names to the txt file
            for result in results.get('data'):
                print(result['name'])
                f.writelines(result['name'] + ' | ')
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.user_parse)
        # Paginate through the followee list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            # The 'next' url returned here is not directly accessible (probably an anti-crawling measure by Zhihu),
            # so extract url_token and offset with a regex and rebuild the url for the next page
            num_re = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
            print('get url token {} offset {}'.format(num_re.group(1), num_re.group(2)))
            user = num_re.group(1)
            offset = num_re.group(2)
            next_page_url = self.follow_url.format(user=user, include=self.follow_query, limit=20, offset=offset)
            print('next_page_url={}'.format(next_page_url))
            yield Request(next_page_url, self.follow_parse)
Run it directly from the terminal in PyCharm.
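The crawl command is the same as in the verification step, using the spider name defined above:

scrapy crawl zhihu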
Opening zhihu.txt shows that plenty of users were indeed captured; a half-hour test run without any proxy was never interrupted by a Zhihu captcha.
3. Also crawling the followers page and storing into MongoDB
The version above collects url_token values from the followees page (people the start user follows); you can collect them from the followers page (people who follow the user) at the same time. Just add a followers url below the followees url (the include query is identical, so it does not change) and add a follower_parse function.
# -*- coding: utf-8 -*-
from scrapy import Spider, Request
import json
import re
from ..items import UsersItem

# Same sanity-check file as in the first version; follower_parse still writes names into it
f = open(r'F:\zhihu.txt', 'w', encoding='utf-8')


class UserinfoSpider(Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    start_user = 'kaifulee'
    # User profile url
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
    # People this user follows (followees)
    follow_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    # People following this user (followers)
    follower_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    follow_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        # Seed requests: the starting user's profile page, followee list, and follower list
        # url = 'https://www.zhihu.com/api/v4/members/kaifulee/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20'
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query),
                      callback=self.user_parse)
        yield Request(self.follow_url.format(user=self.start_user, include=self.follow_query, offset=0, limit=20),
                      callback=self.follow_parse)
        yield Request(self.follower_url.format(user=self.start_user, include=self.follow_query, offset=0, limit=20),
                      callback=self.follower_parse)

    def user_parse(self, response):
        result = json.loads(response.text)
        # Collect the user's profile info
        item = UsersItem()
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        yield Request(self.follow_url.format(user=result.get('url_token'), include=self.follow_query, limit=20, offset=0),
                      callback=self.follow_parse)
        yield Request(self.follower_url.format(user=result.get('url_token'), include=self.follow_query, limit=20, offset=0),
                      callback=self.follower_parse)

    def follow_parse(self, response):
        results = json.loads(response.text)
        # Pull each followee's url_token and feed it back into the profile url to keep iterating
        if 'data' in results.keys():
            # Print the name as a quick progress check
            for result in results.get('data'):
                print(result['name'])
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.user_parse)
        # Paginate through the followee list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            # The returned 'next' url is not directly accessible, so rebuild it from url_token and offset
            num_re = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
            print('get url token {} offset {}'.format(num_re.group(1), num_re.group(2)))
            user = num_re.group(1)
            offset = num_re.group(2)
            next_page_url = self.follow_url.format(user=user, include=self.follow_query, limit=20, offset=offset)
            print('next_page_url={}'.format(next_page_url))
            yield Request(next_page_url, self.follow_parse)

    def follower_parse(self, response):
        results = json.loads(response.text)
        # Pull each follower's url_token and feed it back into the profile url to keep iterating
        if 'data' in results.keys():
            # As a sanity check, save the user names to the txt file
            for result in results.get('data'):
                print(result['name'])
                f.writelines(result['name'] + ' | ')
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.user_parse)
        # Paginate through the follower list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            # The returned 'next' url is not directly accessible, so rebuild it from url_token and offset
            num_re = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
            print('get url token {} offset {}'.format(num_re.group(1), num_re.group(2)))
            user = num_re.group(1)
            offset = num_re.group(2)
            next_page_url = self.follower_url.format(user=user, include=self.follow_query, limit=20, offset=offset)
            print('next_page_url={}'.format(next_page_url))
            yield Request(next_page_url, self.follower_parse)
To save the results to MongoDB, copy the MongoDB pipeline demo from the official Scrapy documentation and use it almost as-is; only the database write needs a small change so that records are deduplicated (upserted by url_token) before being stored.
Official documentation: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
pipelines.py
import pymongo


class MongoPipeline(object):
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert keyed on url_token so the same user is never stored twice
        self.db['users'].update({'url_token': item['url_token']}, {'$set': dict(item)}, True)
        return item
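One note: Collection.update() is deprecated in pymongo 3.x. On a newer pymongo, the same url_token-based dedup could be expressed with update_one instead; a sketch of the replacement process_item method:

    def process_item(self, item, spider):
        # Same behaviour as above, using the non-deprecated update_one API
        self.db['users'].update_one(
            {'url_token': item['url_token']},  # match the record for this user, if it exists
            {'$set': dict(item)},              # overwrite its fields with the freshly crawled values
            upsert=True,                       # insert a new document when the user has not been seen yet
        )
        return item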
In settings.py, enable the item pipeline and add the MongoDB configuration.
ITEM_PIPELINES = {
    'zhihu.pipelines.MongoPipeline': 300,
}
MONGO_URI = 'localhost'
MONGO_DATABASE = 'zhihu'
Why do the MongoDB settings use these two particular variable names? Because MongoPipeline reads exactly these two variables from the settings, so the names must match; if you want to rename them, rename them in pipelines.py as well.
@classmethod
def from_crawler(cls, crawler):
    return cls(
        mongo_uri=crawler.settings.get('MONGO_URI'),
        mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
    )
The results in MongoDB:
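A quick way to spot-check what was stored is to query the collection directly with pymongo (a sketch, assuming the MONGO_URI and MONGO_DATABASE values from settings.py above, and pymongo 3.7+ for count_documents):

import pymongo

# Connect with the same values used for MONGO_URI / MONGO_DATABASE in settings.py
client = pymongo.MongoClient('localhost')
db = client['zhihu']

print(db['users'].count_documents({}))  # how many users have been stored so far
print(db['users'].find_one())           # inspect one stored document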
At one location I ran for half an hour without a proxy and scraped more than 20,000 records without ever seeing a captcha; at another location, using a proxy, the captcha showed up after a little over 4,000 records. The captcha problem can be handled with a dynamic proxy pool maintained behind a Flask service.
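A minimal sketch of such a setup, assuming a hypothetical Flask proxy-pool service that returns one proxy address (ip:port) as plain text at http://127.0.0.1:5000/random; it is written as a Scrapy downloader middleware:

import requests


class RandomProxyMiddleware(object):
    # Hypothetical endpoint of the Flask-backed proxy pool; point this at your own service
    proxy_pool_url = 'http://127.0.0.1:5000/random'

    def get_random_proxy(self):
        # Ask the proxy pool for one usable proxy, e.g. '123.45.67.89:8080'
        try:
            resp = requests.get(self.proxy_pool_url, timeout=5)
            if resp.status_code == 200:
                return resp.text.strip()
        except requests.RequestException:
            pass
        return None

    def process_request(self, request, spider):
        # Scrapy's downloader honours request.meta['proxy'] for each outgoing request
        proxy = self.get_random_proxy()
        if proxy:
            request.meta['proxy'] = 'http://{}'.format(proxy)

The middleware would also need to be enabled in settings.py, e.g. DOWNLOADER_MIDDLEWARES = {'zhihu.middlewares.RandomProxyMiddleware': 543}.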