Scraping news titles and keywords from the five main sections of NetEase News (scrapy & selenium)
Requirement: scrape the news titles and keywords of the five main sections of NetEase News and store the results both in a local file and in a MySQL database. The homepage can be requested directly, but the content of each section page is loaded dynamically, so a Selenium-controlled browser is plugged in through a downloader middleware to render those pages before they are parsed.
File items.py:
import scrapy


class WangyiproItem(scrapy.Item):
    title = scrapy.Field()     # news title
    keywords = scrapy.Field()  # keywords collected from the article detail page
Spider file wangyi.py:
import scrapy
from selenium import webdriver

from wangyipro.items import WangyiproItem


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    start_urls = ['https://news.163.com/']
    models_urls = []  # urls of the five section pages

    # instantiate a browser object that is shared by the whole spider
    def __init__(self):
        self.bro = webdriver.Chrome(executable_path='chromedriver.exe')

    # parse the NetEase News homepage
    def parse(self, response):
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[3]/div[2]/div[2]/div[2]/div/ul/li')
        alist = [1, 2, 4, 5, 7]  # menu indexes of the five target sections
        for index in alist:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            self.models_urls.append(model_url)
        # request each section page
        for url in self.models_urls:
            yield scrapy.Request(url, callback=self.parse_model)

    # the content of each section page is loaded dynamically, so the downloader
    # middleware replaces these responses with pages rendered by selenium
    # parse one of the five section pages
    def parse_model(self, response):
        div_list = response.xpath('//div[@class="ndi_main"]/div')
        for div in div_list:
            title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
            detail_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()
            item = WangyiproItem()
            item['title'] = title
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

    # parse the article detail page
    def parse_detail(self, response):
        keywords = response.xpath('//*[@id="content"]/div[2]//text()').extract()
        keywords = ''.join(keywords)
        item = response.meta['item']
        item['keywords'] = keywords
        yield item

    # quit the browser when the spider closes
    def closed(self, spider):
        self.bro.quit()
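The driver above is created with executable_path, which is deprecated in Selenium 4; newer versions locate chromedriver automatically via Selenium Manager. If you also want Chrome to run without an on-screen window, a minimal sketch (the headless flags and the automatic driver resolution are assumptions, not part of the original project) looks like this:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless=new')   # run without a visible window (use '--headless' on older Chrome)
bro = webdriver.Chrome(options=options)  # Selenium 4+ finds chromedriver on its own
bro.get('https://news.163.com/')
print(len(bro.page_source))
bro.quit()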
Middleware file middlewares.py:
from time import sleep

from scrapy.http import HtmlResponse


class WangyiproDownloaderMiddleware:
    # intercept requests
    def process_request(self, request, spider):
        return None

    # intercept responses
    def process_response(self, request, response, spider):
        bro = spider.bro
        # only intercept responses for the five section pages
        if request.url in spider.models_urls:
            bro.get(request.url)
            sleep(5)  # give the dynamically loaded content time to appear
            page_text = bro.page_source
            # wrap the selenium-rendered page source in a new response object
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            return response

    # intercept requests that raised an exception
    def process_exception(self, request, exception, spider):
        pass
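The section pages keep loading more entries as they are scrolled, so a fixed sleep(5) can miss items. One possible refinement, sketched below with a hypothetical helper render_with_scroll that is not part of the original middleware, is to scroll to the bottom a few times before reading page_source:
from time import sleep

def render_with_scroll(bro, url, rounds=3, pause=2):
    # load the url in the shared selenium browser, then scroll to the bottom
    # repeatedly so that lazily loaded entries are actually requested
    bro.get(url)
    for _ in range(rounds):
        bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        sleep(pause)
    return bro.page_source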
Pipeline file pipelines.py:
import pymysql


# store the data in a local text file
class WangyiproPipeline:
    fp = None

    def open_spider(self, spider):
        print('Start writing to the local file......')
        self.fp = open('./wangyi_news.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['title']
        keywords = item['keywords']
        self.fp.write(title + ':' + keywords + '\n')
        return item

    def close_spider(self, spider):
        print('Local storage finished!')
        self.fp.close()


# store the data in a MySQL database
class mysqlpipeline:
    conn = None
    cursor = None

    def open_spider(self, spider):
        print('Connecting to the database...')
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123456',
                                    db='wangyi', charset='utf8')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            # parameterized SQL so quotes in the title or keywords don't break the insert
            self.cursor.execute('insert into scrapy_test values(%s, %s)',
                                (item['title'], item['keywords']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    # close the database connection
    def close_spider(self, spider):
        print('Database storage finished')
        self.cursor.close()
        self.conn.close()
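The insert statement assumes that the wangyi database and a two-column table named scrapy_test already exist. The original post does not show the schema, so the column names and types below are assumptions; a sketch that creates a matching table with pymysql could look like this:
import pymysql

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123456', charset='utf8')
cursor = conn.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS wangyi CHARACTER SET utf8')
cursor.execute('USE wangyi')
# title/keywords columns match the two values inserted by mysqlpipeline (assumed types)
cursor.execute('CREATE TABLE IF NOT EXISTS scrapy_test (title VARCHAR(255), keywords TEXT)')
conn.commit()
cursor.close()
conn.close()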
Configuration file settings.py:
# UA spoofing
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
# ignore the robots.txt protocol
ROBOTSTXT_OBEY = False
# only log error messages
LOG_LEVEL = 'ERROR'
# enable the downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'wangyipro.middlewares.WangyiproDownloaderMiddleware': 543,
}
# enable the pipelines
ITEM_PIPELINES = {
    'wangyipro.pipelines.WangyiproPipeline': 300,
    'wangyipro.pipelines.mysqlpipeline': 301,
}
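With the settings in place the crawl is normally started from the project directory with scrapy crawl wangyi. If you prefer to launch it from a plain Python script instead, a sketch using Scrapy's CrawlerProcess is shown below; the import path wangyipro.spiders.wangyi assumes the default project layout and is not spelled out in the original post:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from wangyipro.spiders.wangyi import WangyiSpider

# load settings.py (USER_AGENT, middleware, pipelines) and run the spider
process = CrawlerProcess(get_project_settings())
process.crawl(WangyiSpider)
process.start()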