# Scrapy Framework
### Installation
- Environment setup:
  - mac/linux: pip install scrapy
  - Windows:
    - pip install wheel
    - Download Twisted from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
    - In the download directory, run pip install Twisted-17.1.0-cp35-cp35m-win_amd64.whl (pick the .whl that matches your Python version and architecture)
    - pip install pywin32
    - pip install scrapy
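If the installation succeeded, Scrapy should be importable; a quick sanity check (a minimal sketch, not part of the original notes):

```python
# Minimal check that the install worked
import scrapy

print(scrapy.__version__)  # prints the installed Scrapy version
```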
# Create a new Scrapy project
scrapy startproject ProName
# Enter the project directory
cd ProName
# Create a spider file
# <python interpreter> scrapy genspider <spider file name> <start url>
python.exe -m scrapy genspider first www.baidu.com
# Run the project
python.exe -m scrapy crawl spiderName
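For reference, startproject generates roughly the following project layout; first.py appears under spiders/ after the genspider command runs:

```
ProName/
├── scrapy.cfg            # project deployment / config entry point
└── ProName/
    ├── __init__.py
    ├── items.py          # item field definitions
    ├── middlewares.py    # spider / downloader middlewares
    ├── pipelines.py      # item pipelines
    ├── settings.py       # project settings
    └── spiders/
        ├── __init__.py
        └── first.py      # created by the genspider command
```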
Contents of first.py
# encoding: utf-8
import scrapy


# Spider class; the parent class is Spider (other spider base classes exist as well)
class FirstSpider(scrapy.Spider):
    # spider name: the unique identifier of this spider file
    name = 'first'
    # allowed domains; usually left commented out
    # allowed_domains = ['www.baidu.com']
    # list of start URLs:
    # 1. every URL in the list is requested asynchronously by Scrapy
    # 2. each response is handed to the parse callback below
    start_urls = ['https://www.baidu.com/', 'https://www.sogou.com/']

    # used for data parsing; acts as the callback that handles the response
    def parse(self, response):
        '''
        :param response: the response object
        :return:
        '''
        print(response)
### Running the spiders in a Scrapy project to fetch data
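Besides the scrapy crawl command shown above, a spider can also be started from a plain Python script; a minimal sketch using Scrapy's CrawlerProcess (the script name is made up; run it from the project root so the project settings are picked up):

```python
# run_first.py -- hypothetical helper script, placed next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl('first')   # spider name, as defined by the `name` attribute
process.start()          # blocks until the crawl finishes
```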
## robots.txt
# In settings.py, tell Scrapy not to obey robots.txt:
ROBOTSTXT_OBEY = False
## Logging
# Output the full log
- python.exe -m scrapy crawl first
# Suppress log output
- python.exe -m scrapy crawl first --nolog
# Or set the log level / log file in settings.py
LOG_LEVEL='ERROR'
LOG_FILE='./LOG.log'
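Messages logged inside a spider go through the same level filter; a small sketch (the spider below is hypothetical) assuming the two settings above:

```python
import scrapy


class LogDemoSpider(scrapy.Spider):
    # hypothetical spider used only to illustrate log filtering
    name = 'log_demo'
    start_urls = ['https://www.baidu.com/']

    def parse(self, response):
        # filtered out when LOG_LEVEL = 'ERROR'
        self.logger.info('parsed %s', response.url)
        # error-level messages still end up in LOG_FILE (./LOG.log)
        self.logger.error('demo error for %s', response.url)
```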
## UA spoofing, configured in settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
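The setting above applies project-wide. As a sketch (not part of the original notes; the spider name is made up), a single request can also carry its own User-Agent via the headers argument of scrapy.Request:

```python
import scrapy


class UaDemoSpider(scrapy.Spider):
    name = 'ua_demo'  # hypothetical spider name

    def start_requests(self):
        # the explicit header overrides the global USER_AGENT for this request only
        yield scrapy.Request(
            'https://www.baidu.com/',
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'},
            callback=self.parse,
        )

    def parse(self, response):
        print(response.request.headers.get('User-Agent'))
```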
### Persistent storage
# Two approaches
- Terminal-command-based persistence
  - can only store the return value of the parse method to local disk (see the sketch after this list)
  # command; the output file must use a format Scrapy supports (e.g. json, csv, xml)
  - scrapy crawl spiderName -o filepath
- Pipeline-based persistence
  # workflow:
  - 1. parse the data in the spider file
  - 2. declare matching fields/attributes in items.py to hold the parsed values
  - 3. in parse, pack the parsed data into an item-type object
  - 4. in the spider file, submit the item object to the pipeline:
       yield item
  - 5. the pipeline's process_item method receives the item and can persist it in any way
  - 6. enable the pipeline in settings.py
  # 300 is the priority; the smaller the number, the higher the priority
  ITEM_PIPELINES = {
      'choutiPro.pipelines.ChoutiproPipeline': 300,
  }
# One pipeline class defines one way of storing the data: MySQL, local files, PostgreSQL, etc. each need their own pipeline class, and every pipeline class must be added to the settings file
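A minimal sketch of the terminal-command approach (option 1 above): parse returns a list of dicts, and the output format is chosen by the extension passed to -o, e.g. scrapy crawl demo -o demo.csv. The spider name demo is made up and the selectors are borrowed from the chouti example further below:

```python
import scrapy


class DemoSpider(scrapy.Spider):
    # hypothetical spider used only to illustrate `-o` export
    name = 'demo'
    start_urls = ['https://dig.chouti.com/']

    def parse(self, response):
        all_data = []
        for div in response.xpath('//div[@class="link-con"]/div'):
            title = div.xpath('.//div[@class="link-detail"]/a/text()').extract_first()
            all_data.append({'title': title})
        # the returned list is exactly what `scrapy crawl demo -o demo.csv` writes to disk
        return all_data
```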
### Example code
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class ChoutiproPipeline:
    fp = None

    def open_spider(self, spider):
        '''
        Overrides the parent-class hook; runs exactly once, when the spider starts.
        :param spider:
        :return:
        '''
        print('crawl started')
        self.fp = open('chouti.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        '''
        Overrides the parent-class hook; runs exactly once, when the spider closes.
        :param spider:
        :return:
        '''
        print('crawl finished')
        self.fp.close()

    # called for every item object submitted by the spider
    def process_item(self, item, spider):
        '''
        :param item: the item object
        :param spider:
        :return:
        '''
        author = item['author']
        title = item['title']
        print(author, title)
        self.fp.write(author + ":" + title + "\n")
        return item


import pymysql


class ChoutiproPipelineMySQL:
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connection(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='123',
            db='chouti_data',
            charset='utf8'
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        title = item['title']
        author = item['author']
        sql = """
            insert into chouti values("%s","%s")
        """ % (author, title)
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print("error:", e)
            self.conn.rollback()
        # very important: return the item so it is passed on to the next pipeline class
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
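The notes above mention that MySQL, local files, PostgreSQL, etc. each get their own pipeline class. As an illustration only (not part of the original project), a PostgreSQL variant might look like the sketch below, assuming the psycopg2 package and a chouti table with author and title columns; it would also need its own entry in ITEM_PIPELINES:

```python
import psycopg2  # assumed dependency: pip install psycopg2-binary


class ChoutiproPipelinePostgres:
    conn = None
    cursor = None

    def open_spider(self, spider):
        # connection parameters are placeholders
        self.conn = psycopg2.connect(
            host='127.0.0.1',
            port=5432,
            user='postgres',
            password='123',
            dbname='chouti_data',
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            # parameterized query; psycopg2 handles the quoting
            self.cursor.execute(
                'insert into chouti (author, title) values (%s, %s)',
                (item['author'], item['title']),
            )
            self.conn.commit()
        except Exception as e:
            print('error:', e)
            self.conn.rollback()
        # pass the item on to the next pipeline class
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
```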
settings.py
# Scrapy settings for choutiPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'choutiPro'
SPIDER_MODULES = ['choutiPro.spiders']
NEWSPIDER_MODULE = 'choutiPro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL='ERROR'
LOG_FILE='./LOG.log'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'choutiPro.middlewares.ChoutiproSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'choutiPro.middlewares.ChoutiproDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'choutiPro.pipelines.ChoutiproPipelineMySQL': 100,
'choutiPro.pipelines.ChoutiproPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class ChoutiproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # scrapy.Field() works as a universal field type that can hold any value
    title = scrapy.Field()
    author = scrapy.Field()
chouti.py
import scrapy
from choutiPro.items import ChoutiproItem


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://dig.chouti.com/']

    def parse(self, response):
        # Scrapy wraps xpath on the response object
        # parse the title text and the author
        div_list = response.xpath('//div[@class="link-con"]/div')
        all_data = []
        for div in div_list:
            title_text = div.xpath('.//div[@class="link-detail"]/a/text()').extract_first()
            author = div.xpath('.//div[@class="operate-author-con clearfix"]//span[@class="left author-name"]/text()').extract_first()
            """
            xpath() returns Selector objects, e.g.:
            [<Selector xpath='.//div[@class="link-detail"]/a/text()' data='河南博物院上新文物修复盲盒!竟然和做模型有这么多相似之处!'>]
            so the string value still has to be extracted:
            extract()       : extracts all matches, returns a list
            extract_first() : extracts the first element, returns the value itself
            """
            # 1. Option 1: terminal-command-based storage
            # all_data.append({"title_text": title_text, "author": author or 'DD'})
            # return all_data

            # 2. Option 2: pipeline-based storage
            # instantiate an item object
            item = ChoutiproItem()
            # set the item fields; item['title'] = ... calls __setitem__
            item['title'] = title_text
            item['author'] = author
            # submit the item to the pipeline
            yield item