Scrapy基础
框架结构
爬取豆瓣 top250 电影
安装 Scrapy
pip install Scrapy
新建项目
# 创建项目
scrapy startproject douban
# 切换到项目目录下,生成爬虫
cd douban
scrapy genspider douban_spider movie.douban.com
明确目标
在 items.py 中定义想要爬取的字段
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class DoubanItem(scrapy.Item):
    """Container for the data scraped for one movie of the Douban Top 250.

    Every attribute is a plain ``scrapy.Field()``; the spider fills them in
    by key (for example ``item['movie_name'] = ...``).
    """

    serial_number = scrapy.Field()  # position of the movie in the ranking
    movie_name = scrapy.Field()     # movie title
    introduce = scrapy.Field()      # introduction text of the movie
    star = scrapy.Field()           # star rating
    evaluate = scrapy.Field()       # comment / evaluation count text
    describe = scrapy.Field()       # short description of the movie
制作爬虫
使用 XPath 进行 HTML 元素选择。可以借助 Chrome 浏览器扩展 XPath Helper 编写选择器。
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem
class DoubanSpiderSpider(scrapy.Spider):
    """Spider that walks the Douban Top 250 movie list page by page.

    Each ``<li>`` of the ``grid_view`` list is turned into one ``DoubanItem``;
    after a page is processed the spider follows the ``<link rel="next">``
    URL, if the page has one.
    """

    # Spider name, used by ``scrapy crawl douban_spider``.
    name = 'douban_spider'
    # Domains the spider is allowed to crawl.
    allowed_domains = ['movie.douban.com']
    # Entry URL, handed by the scheduler to the downloader.
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Yield one ``DoubanItem`` per movie, then a request for the next page.

        :param response: downloaded list page.
        :returns: generator of ``DoubanItem`` objects, followed by at most one
            ``scrapy.Request`` whose callback is this method again.
        """
        movie_list = response.xpath("//ol[@class='grid_view']//li")
        for item in movie_list:
            douban_item = DoubanItem()
            douban_item['serial_number'] = item.xpath(
                ".//div[1]//div[1]//em[1]/text()").extract_first()
            douban_item['movie_name'] = item.xpath(
                ".//div[1]//div[2]//div[1]//a[1]//span[@class='title']/text()"
            ).extract_first()

            # The paragraph is split into several text nodes. Remove all
            # whitespace inside each node and keep every non-empty one;
            # assigning inside the loop used to keep only the last node and
            # left the field unset when there was no text at all.
            content = item.xpath(
                ".//div[1]//div[2]//div[2]//p[1]/text()").extract()
            cleaned = ("".join(part.split()) for part in content)
            douban_item['introduce'] = " ".join(
                line for line in cleaned if line)

            douban_item['star'] = item.xpath(
                ".//span[@class='rating_num']/text()").extract_first()
            douban_item['evaluate'] = item.xpath(
                ".//div[@class='star']//span[4]/text()").extract_first()
            douban_item['describe'] = item.xpath(
                ".//p[@class='quote']//span[@class='inq']/text()"
            ).extract_first()
            yield douban_item

        # Follow the next page. urljoin resolves the href against the current
        # page URL, so relative ("?start=25&filter=") and absolute hrefs both
        # produce a valid request URL.
        next_link = response.xpath("//link[@rel='next']/@href").extract_first()
        if next_link:
            yield scrapy.Request(response.urljoin(next_link),
                                 callback=self.parse)
运行项目
scrapy crawl douban_spider
Note:如果出现报错,可以在 settings.py 里面设置头部信息,如 USER_AGENT 等。
为了方便运行,可以创建 main.py
from scrapy import cmdline

# Only start the crawl when this file is run as a script, so that importing
# it (for example from an IDE or a test) has no side effects.
if __name__ == '__main__':
    # Same as typing ``scrapy crawl douban_spider`` on the command line.
    cmdline.execute('scrapy crawl douban_spider'.split())
储存数据
储存 json
scrapy crawl douban_spider -o test.json
储存 csv
scrapy crawl douban_spider -o test.csv
Note:用 Excel 打开刚生成的 csv 时,可能出现乱码,需要将文件转码成带 BOM 的 UTF-8 格式。
储存 MongoDB
- 在 settings.py 中 追加
# MongoDB connection parameters. pipelines.py imports these four names
# directly from douban.settings, so they must keep exactly these names.
mongo_host = 'localhost'
mongo_port = 27017
mongo_db_name = 'douban'
mongo_db_collection = 'douban_movie'
并取消注释
# Register the pipeline class from pipelines.py so scraped items reach it.
ITEM_PIPELINES = {
'douban.pipelines.DoubanPipeline': 300,
}
- 如果没有安装 pymongo , 需要进行安装
pip install pymongo
- 在 pipelines.py 中
# -*- coding: utf-8 -*-
import pymongo
from douban.settings import mongo_host, mongo_port, mongo_db_name, mongo_db_collection
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
class DoubanPipeline(object):
    """Item pipeline that stores every scraped item in a MongoDB collection.

    The connection parameters (``mongo_host``, ``mongo_port``,
    ``mongo_db_name``, ``mongo_db_collection``) are the names imported from
    ``douban.settings`` at the top of this file.
    """

    def __init__(self):
        """Open the MongoDB client and select the target collection."""
        # Keep a reference to the client so it can be closed in close_spider.
        self.client = pymongo.MongoClient(host=mongo_host, port=mongo_port)
        self.post = self.client[mongo_db_name][mongo_db_collection]

    def process_item(self, item, spider):
        """Insert ``item`` into the collection and pass it on unchanged.

        :param item: the scraped item; converted to a plain dict for storage.
        :param spider: the spider that produced the item (unused).
        :returns: the same ``item``, so later pipelines can still process it.
        """
        data = dict(item)
        # Collection.insert() is deprecated since PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        self.post.insert_one(data)
        return item

    def close_spider(self, spider):
        """Release the MongoDB connection when the spider finishes."""
        self.client.close()
爬虫的伪装
代理 IP
- 申请 IP 代理。这里使用阿布云
- 在 middlewares.py 中编写
import base64
.
.
.
class my_proxy(object):
    """Downloader middleware that sends every request through an HTTP proxy.

    The proxy address is stored in ``request.meta['proxy']`` and the
    credentials are sent as a ``Proxy-Authorization: Basic ...`` header.
    """

    # Proxy endpoint, written with an explicit scheme as in the proxy URL
    # format shown in Scrapy's HttpProxyMiddleware documentation.
    PROXY_SERVER = 'http://http-pro.abuyun.com:9010'
    # NOTE(review): credentials are hard-coded in source; prefer loading them
    # from the environment or an untracked settings file before sharing code.
    PROXY_USER_PASS = b'HwSSD7VC73K55YT1P:F121AA9B2DDA7C35'

    def process_request(self, request, spider):
        """Attach proxy address and credentials to ``request``.

        :param request: outgoing request; its ``meta`` and ``headers`` are
            modified in place.
        :param spider: the spider issuing the request (unused).
        :returns: ``None``.
        """
        request.meta['proxy'] = self.PROXY_SERVER
        # Basic auth expects base64("user:password") as text.
        token = base64.b64encode(self.PROXY_USER_PASS).decode()
        request.headers['Proxy-Authorization'] = 'Basic ' + token
- 修改 settings.py
.
.
.
# Enable the proxy middleware class written in middlewares.py; the generated
# default middleware entry stays commented out.
DOWNLOADER_MIDDLEWARES = {
# 'douban.middlewares.DoubanDownloaderMiddleware': 543,
'douban.middlewares.my_proxy': 543,
}
.
.
.
随机设置 user-agent
- 在 middlewares.py 中编写
import random
.
.
.
class my_useragent(object):
    """Downloader middleware that sets a random User-Agent on each request."""

    # Candidate User-Agent strings. Defined once on the class instead of
    # being rebuilt inside process_request for every single request.
    USER_AGENT_LIST = [
        'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
        'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
        'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
        'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
        'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
        'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
        'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
        'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)',
    ]

    def process_request(self, request, spider):
        """Pick one entry of ``USER_AGENT_LIST`` at random for ``request``.

        :param request: outgoing request; its ``User-Agent`` header is
            overwritten in place.
        :param spider: the spider issuing the request (unused).
        :returns: ``None``.
        """
        request.headers['User-Agent'] = random.choice(self.USER_AGENT_LIST)
.
.
.
- 修改 settings.py
.
.
.
# Enable both custom middleware classes from middlewares.py: the proxy one
# and the random User-Agent one.
DOWNLOADER_MIDDLEWARES = {
# 'douban.middlewares.DoubanDownloaderMiddleware': 543,
'douban.middlewares.my_proxy': 543,
'douban.middlewares.my_useragent': 544,
}
.
.
.
注意事项
- 中间件定义完成要在 settings.py 中启用。
- 爬虫名称不能与项目名称相同(否则 scrapy genspider 会报错),spiders 目录中不能存在相同爬虫名称的爬虫。
内容来源于网络或书籍