Scrape the Douban Movie Top 250 and store the results in MongoDB
1. Create the project douban
PS D:\scrapy> scrapy.exe startproject douban
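startproject generates roughly the following layout (the exact files vary slightly by Scrapy version):

douban/
    scrapy.cfg            # deploy configuration
    douban/
        __init__.py
        items.py          # item definitions (step 4)
        middlewares.py
        pipelines.py      # item pipelines (step 6)
        settings.py       # project settings (step 5)
        spiders/
            __init__.py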
2. Generate a spider
PS D:\scrapy\douban> scrapy genspider doubanmovie "movie.douban.com"
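genspider writes a skeleton into douban/spiders/doubanmovie.py, roughly as below (the exact template depends on the Scrapy version); step 3 replaces it with the real logic:

# -*- coding: utf-8 -*-
import scrapy

class DoubanmovieSpider(scrapy.Spider):
    name = 'doubanmovie'
    allowed_domains = ['movie.douban.com']
    start_urls = ['http://movie.douban.com/']

    def parse(self, response):
        pass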
3. Write the spider logic in doubanmovie.py
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem


class DoubanmovieSpider(scrapy.Spider):
    name = 'doubanmovie'
    allowed_domains = ['movie.douban.com']
    offset = 0
    base_url = "https://movie.douban.com/top250?start="
    start_urls = [base_url + str(offset)]

    def parse(self, response):
        movies = response.xpath("//div[@class='info']")
        for i in movies:
            # Title
            title = i.xpath(".//span[@class='title'][1]/text()").extract()[0].strip()
            # Info line (director, cast, year, etc.)
            bd = i.xpath(".//div[@class='bd']/p/text()").extract()[0].strip()
            # Rating
            star = i.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract()[0].strip()
            # One-line quote (some movies have none, so fall back to an empty string)
            quote = i.xpath(".//p[@class='quote']/span/text()").extract()
            quote = quote[0].strip() if quote else ""
            yield DoubanItem(title=title, bd=bd, star=star, quote=quote)
        # Keep paging while offset < 225: the Top 250 spans 10 pages of 25,
        # and the last page starts at offset 225
        if self.offset < 225:
            self.offset += 25
            yield scrapy.Request(self.base_url + str(self.offset), callback=self.parse)
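Before running the full crawl, the XPath expressions can be checked interactively with scrapy shell (the -s USER_AGENT override is needed for the same 403 reason noted in settings.py below; each page should yield 25 info blocks):

PS D:\scrapy\douban> scrapy shell -s USER_AGENT="Mozilla/5.0" "https://movie.douban.com/top250"
>>> len(response.xpath("//div[@class='info']"))   # expect 25 per page
>>> response.xpath("//div[@class='info'][1]//span[@class='title'][1]/text()").extract_first()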
4. Define the item fields in items.py
import scrapy


class DoubanItem(scrapy.Item):
    # Title
    title = scrapy.Field()
    # Info line (director, cast, year, etc.)
    bd = scrapy.Field()
    # Rating
    star = scrapy.Field()
    # Quote
    quote = scrapy.Field()
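A scrapy.Item behaves like a dict, which is why the pipeline in step 6 can simply call dict(item); a quick illustration with made-up sample values:

item = DoubanItem(title="肖申克的救赎", star="9.7")
item["quote"] = "希望让人自由。"
print(dict(item))   # {'title': '肖申克的救赎', 'star': '9.7', 'quote': '希望让人自由。'}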
5. In settings.py, route the scraped items to the pipeline for saving
# Pipelines that process the scraped items
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}

# Default request headers
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # Without a User-Agent, Douban responds with 403
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
}

# MongoDB connection settings
MONGODB_HOST = "192.8.11.100"
MONGODB_PORT = 27017
MONGODB_DBNAME = "Douban"
MONGODB_SHEETNAME = "douban_top_250"
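Note: settings.py files generated by newer Scrapy versions (1.1+) default to ROBOTSTXT_OBEY = True, and Douban's robots.txt can then filter the requests. If the log shows "Forbidden by robots.txt", turn it off:

ROBOTSTXT_OBEY = False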
6. In pipelines.py, save the items into MongoDB
import pymongo


class DoubanPipeline(object):
    def __init__(self, host, port, dbname, sheetname):
        # Create the MongoDB client
        # (if authentication is required, pass username=/password= to MongoClient)
        self.client = pymongo.MongoClient(host=host, port=port)
        # Select the database
        mydb = self.client[dbname]
        # Collection that stores the scraped movies
        self.sheet = mydb[sheetname]

    @classmethod
    def from_crawler(cls, crawler):
        # Read the MongoDB configuration from settings.py
        settings = crawler.settings
        return cls(
            host=settings["MONGODB_HOST"],
            port=settings["MONGODB_PORT"],
            dbname=settings["MONGODB_DBNAME"],
            sheetname=settings["MONGODB_SHEETNAME"],
        )

    def process_item(self, item, spider):
        self.sheet.insert_one(dict(item))
        return item
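If you want the MongoDB connection released when the crawl finishes, a pipeline may also define a close_spider hook, which Scrapy calls once on shutdown; a minimal addition to the class above:

    def close_spider(self, spider):
        # Called once when the spider closes; release the connection
        self.client.close()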
7. Run the spider
PS D:\scrapy\douban> scrapy crawl doubanmovie
8. Check the log output
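The result can also be checked directly with pymongo, a quick sanity check using the host, database, and collection names from settings.py (count_documents needs pymongo 3.7+; this is not part of the project code):

import pymongo

client = pymongo.MongoClient("192.8.11.100", 27017)
coll = client["Douban"]["douban_top_250"]
print(coll.count_documents({}))   # a complete crawl inserts 250 documents
for doc in coll.find().limit(3):
    print(doc["title"], doc["star"])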