Parsing data with Scrapy
# Run the spider
scrapy crawl cnblogs
##### You can create a main.py in the project directory
from scrapy.cmdline import execute
execute(['scrapy','crawl','cnblogs','--nolog'])
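A slightly fuller sketch of the same idea (assuming main.py sits in the project root next to scrapy.cfg, so the Scrapy CLI can locate the project); drop --nolog while debugging to see the full log output:
# main.py -- run/debug the spider from the IDE instead of the terminal
from scrapy.cmdline import execute

if __name__ == '__main__':
    # equivalent to typing `scrapy crawl cnblogs` in the terminal; add '--nolog' to silence output
    execute(['scrapy', 'crawl', 'cnblogs'])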
#### Key points
1 The response object has a css() method and an xpath() method
    - css() takes a CSS selector: response.css('')
    - xpath() takes an XPath expression: response.xpath('')
2 Key point 1: taking text vs. attributes
    - XPath for text content
        './/a[contains(@class,"link-title")]/text()'
    - XPath for an attribute
        './/a[contains(@class,"link-title")]/@href'
    - CSS for text content
        'a.link-title::text'
    - CSS for an attribute
        'img.image-scale::attr(src)'
3 Key point 2:
    .extract_first()  take the first match
    .extract()        take all matches
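These selectors can be tried interactively before writing the spider; a quick sketch in scrapy shell, reusing the link-title selectors from the key points above (.get()/.getall() are the newer aliases for .extract_first()/.extract()):
# inside: scrapy shell https://www.cnblogs.com
response.css('a.link-title::text').extract_first()                           # first matching text node
response.css('a.link-title::text').extract()                                 # list of all matches
response.xpath('//a[contains(@class,"link-title")]/@href').extract_first()   # first href attribute
response.css('a.link-title::text').get()       # same as .extract_first()
response.css('a.link-title::text').getall()    # same as .extract()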
import scrapy


class CnblogsSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["https://www.cnblogs.com"]

    def parse(self, response):
        article_list = response.css('article.post-item')
        for item in article_list:
            name = item.css("a.post-item-title::text").extract_first()
            article = item.css("a.post-item-author span::text").extract_first()
            article_img = item.css("img.avatar::attr(src)").extract_first()
            time = item.css("span.post-meta-item span::text").extract_first()
            likes = item.css("a.post-meta-item span::text").extract_first()
            comments = item.xpath(".//section/footer/a[3]/span/text()").extract_first()
            num_views = item.xpath(".//section/footer/a[4]/span/text()").extract_first()
            desc = item.xpath('./section/div/p/text()').extract()  # the summary text may be in the second position
            desc_content = desc[0].replace('\n', '').replace(' ', '')
            if not desc_content:
                desc_content = desc[1].replace('\n', '').replace(' ', '')
            desc_add = item.css('section > div > a::attr(href)').extract_first()
            print(
                '''
                Title: %s
                Author: %s
                Summary: %s
                URL: %s
                Avatar: %s
                Published: %s
                Likes: %s
                Comments: %s
                Views: %s
                ''' % (name, article, desc_content, desc_add, article_img, time, likes, comments, num_views))
Configuration file
#### Basic settings
# Project name
BOT_NAME = "scrapy_demo"
# Where the spider modules live
SPIDER_MODULES = ["scrapy_demo.spiders"]
NEWSPIDER_MODULE = "scrapy_demo.spiders"
# Remember this one: the log level
LOG_LEVEL = 'ERROR'
# USER_AGENT sent in the request headers
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
# Whether to obey robots.txt
ROBOTSTXT_OBEY = False
# Default request headers
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}
# Spider middlewares
#SPIDER_MIDDLEWARES = {
#    "scrapy_demo.middlewares.ScrapyDemoSpiderMiddleware": 543,
#}
# Downloader middlewares
#DOWNLOADER_MIDDLEWARES = {
#    "scrapy_demo.middlewares.ScrapyDemoDownloaderMiddleware": 543,
#}
# Item pipelines (persistence)
#ITEM_PIPELINES = {
#    "scrapy_demo.pipelines.ScrapyDemoPipeline": 300,
#}
### Advanced settings (improving crawl efficiency)
#1 Increase concurrency: default is 16
By default Scrapy runs 16 concurrent requests; you can raise this in the settings file:
CONCURRENT_REQUESTS = 100
With a value of 100, concurrency is set to 100.
#2 Raise the log level:
Scrapy produces a large amount of log output while running; to cut CPU usage, set the log level to INFO or ERROR. In the settings file:
LOG_LEVEL = 'INFO'
#3 Disable cookies:
If you don't actually need cookies, disable cookie handling while crawling to reduce CPU usage and speed things up. In the settings file:
COOKIES_ENABLED = False
#4 Disable retries:
Re-requesting failed HTTP requests (retrying) slows the crawl down, so retries can be disabled. In the settings file:
RETRY_ENABLED = False
#5 Reduce the download timeout:
When a link is very slow, a smaller download timeout lets stuck requests be abandoned quickly, improving efficiency. In the settings file:
DOWNLOAD_TIMEOUT = 10  # timeout of 10 seconds
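Taken together, a minimal settings.py tuned for speed might look like the sketch below; the values are the examples from this section, not required settings:
# settings.py -- example values only; tune them for the target site
CONCURRENT_REQUESTS = 100   # default is 16
LOG_LEVEL = 'ERROR'         # less logging, less CPU
COOKIES_ENABLED = False     # skip cookie handling if not needed
RETRY_ENABLED = False       # do not retry failed requests
DOWNLOAD_TIMEOUT = 10       # give up on slow responses after 10 seconds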
Whole-site crawl of cnblogs --> crawling detail pages --> passing data between requests
# Whole-site crawl:
Crawl every page
    - parse out the next-page URL: yield Request(url=next, callback=self.parse)
Crawl article detail pages
    - parse out the detail-page URL: yield Request(url=url, callback=self.detail_parser)
Pass data between multiple Requests
    yield Request(url=url, meta={'item': item})
    then in the callback's response: response.meta.get('item')
from scrapy import Request
import scrapy


class CnblogsSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["https://www.cnblogs.com"]

    def parse(self, response):
        article_list = response.css('article.post-item')
        for item in article_list:
            name = item.css("a.post-item-title::text").extract_first()
            article = item.css("a.post-item-author span::text").extract_first()
            article_img = item.css("img.avatar::attr(src)").extract_first()
            time = item.css("span.post-meta-item span::text").extract_first()
            likes = item.css("a.post-meta-item span::text").extract_first()
            comments = item.xpath(".//section/footer/a[3]/span/text()").extract_first()
            num_views = item.xpath(".//section/footer/a[4]/span/text()").extract_first()
            desc = item.xpath('./section/div/p/text()').extract()  # the summary text may be in the second position
            desc_content = desc[0].replace('\n', '').replace(' ', '')
            if not desc_content:
                desc_content = desc[1].replace('\n', '').replace(' ', '')
            desc_add = item.css('section > div > a::attr(href)').extract_first()
            print(
                '''
                Title: %s
                Author: %s
                Summary: %s
                URL: %s
                Avatar: %s
                Published: %s
                Likes: %s
                Comments: %s
                Views: %s
                ''' % (name, article, desc_content, desc_add, article_img, time, likes, comments, num_views))
            # rebuild item as a dict and hand it to the detail request via meta
            item = {"name": name, "url": desc_add, "img": article_img, "text": None}
            yield Request(url=desc_add, callback=self.detail_parser, meta={'item': item})
        # follow the "next page" link and parse it with this same method
        n_next = 'https://www.cnblogs.com' + response.xpath('//div[@class="pager"]/a[last()]/@href').extract_first()
        print(n_next)
        yield Request(url=n_next, callback=self.parse)

    def detail_parser(self, response):
        item = response.meta.get("item")
        text = response.css('#cnblogs_post_body').extract_first()
        print(text)
        item['text'] = text
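As a side note, newer Scrapy versions (1.7+) can also pass data to the callback through cb_kwargs instead of meta; a minimal sketch of the same hand-off, trimmed down to the title and detail text:
# cb_kwargs alternative to meta (Scrapy 1.7+) -- a sketch, not the version used above
from scrapy import Request
import scrapy


class CnblogsSpider(scrapy.Spider):
    name = "cnblogs"
    start_urls = ["https://www.cnblogs.com"]

    def parse(self, response):
        for item_sel in response.css('article.post-item'):
            url = item_sel.css('section > div > a::attr(href)').extract_first()
            item = {"name": item_sel.css("a.post-item-title::text").extract_first(), "url": url, "text": None}
            # cb_kwargs delivers the dict straight into the callback's signature
            yield Request(url=url, callback=self.detail_parser, cb_kwargs={'item': item})

    def detail_parser(self, response, item):
        # no response.meta lookup needed: item arrives as a keyword argument
        item['text'] = response.css('#cnblogs_post_body').extract_first()
        yield item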
Persistence
# Method 1: (parse must return a value, and it must be a list of dicts ---> then the export command can save it to JSON, CSV, ...)
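The "command" referred to here is Scrapy's built-in feed export; the output format is inferred from the file extension (the filenames below are just examples):
scrapy crawl cnblogs -o cnblogs.json
scrapy crawl cnblogs -o cnblogs.csv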
# cnblogs.py
from scrapy import Request
from requests_demo.items import RequestsDemoItem
import scrapy


class CnblogsSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["https://www.cnblogs.com"]

    def parse(self, response):
        article_list = response.css('article.post-item')
        for item in article_list:
            name = item.css("a.post-item-title::text").extract_first()
            article = item.css("a.post-item-author span::text").extract_first()
            article_img = item.css("img.avatar::attr(src)").extract_first()
            time = item.css("span.post-meta-item span::text").extract_first()
            likes = item.css("a.post-meta-item span::text").extract_first()
            comments = item.xpath(".//section/footer/a[3]/span/text()").extract_first()
            num_views = item.xpath(".//section/footer/a[4]/span/text()").extract_first()
            desc = item.xpath('./section/div/p/text()').extract()  # the summary text may be in the second position
            desc_content = desc[0].replace('\n', '').replace(' ', '')
            if not desc_content:
                desc_content = desc[1].replace('\n', '').replace(' ', '')
            desc_add = item.css('section > div > a::attr(href)').extract_first()
            # build an Item and hand it to the detail request via meta
            item = RequestsDemoItem(name=name, article=article, desc_content=desc_content, desc_add=desc_add,
                                    article_img=article_img, time=time, likes=likes, comments=comments,
                                    num_views=num_views, text=None)
            yield Request(url=desc_add, callback=self.detail_parser, meta={'item': item})
        n_next = 'https://www.cnblogs.com' + response.xpath('//div[@class="pager"]/a[last()]/@href').extract_first()
        yield Request(url=n_next, callback=self.parse)

    def detail_parser(self, response):
        item = response.meta.get("item")
        text = response.css('#cnblogs_post_body').extract_first()
        item['text'] = text
        print(item)
        yield item
# items.py
import scrapy


class RequestsDemoItem(scrapy.Item):
    # name, article, desc_content, desc_add, article_img, time, likes, comments, num_views
    name = scrapy.Field()
    article = scrapy.Field()
    desc_content = scrapy.Field()
    desc_add = scrapy.Field()
    article_img = scrapy.Field()
    time = scrapy.Field()
    likes = scrapy.Field()
    comments = scrapy.Field()
    num_views = scrapy.Field()
    text = scrapy.Field()
# pipelines.py -- write items to a file
from itemadapter import ItemAdapter


class RequestsDemoPipeline:
    # this is the important part: open the file once, write per item, close at the end
    def __init__(self):
        self.f = None

    def open_spider(self, spider):
        print('spider opened')
        # open the file when the spider starts
        self.f = open('cnblogs.text', 'at', encoding='utf-8')

    def process_item(self, item, spider):
        # field names must match RequestsDemoItem (article / article_img, not author / img)
        self.f.write('Title: %s, Author: %s, Avatar: %s\n' % (item['name'], item['article'], item['article_img']))
        return item

    def close_spider(self, spider):
        print('spider closed')
        self.f.close()
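For the pipeline to actually run, it also has to be registered in settings.py. Assuming the project package is named requests_demo (as the imports above suggest), the registration would look like:
# settings.py
ITEM_PIPELINES = {
    # lower numbers run earlier in the pipeline chain; 300 is the conventional default
    "requests_demo.pipelines.RequestsDemoPipeline": 300,
}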