Improving the Scrapy novel-crawling program (MongoDB version)
1. Background: the original program required a novel to be crawled successfully in one go; if anything failed, the whole crawl had to start over from the beginning, which hurt crawling efficiency.
2. Improvement approach
(1) Add a lookup of already-crawled content: if a chapter is already stored in MongoDB, it is not crawled again.
(2) Add timing of the total crawl time. (A minimal sketch of both ideas follows this list.)
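The following standalone sketch illustrates the two ideas outside of Scrapy. It assumes a local MongoDB reachable with the same admin:admin credentials used by the pipeline below; the collection name and chapter url are only illustrative. The spider further down implements the same check by iterating a cursor of stored urls; find_one is used here just to keep the sketch short.

import time
from pymongo import MongoClient

start_time = time.time()                      # (2) start timing the whole crawl

# illustrative connection, collection and chapter url
conn = MongoClient('mongodb://admin:admin@localhost:27017/admin')
collection = conn.novels['some_novel']
chapter_url = 'https://www.xbiquge.la/10/10489/4534454.html'

# (1) look the chapter up before crawling; if it is already stored, skip it
if collection.find_one({'url': chapter_url}) is None:
    print('Not in MongoDB yet, crawl it:', chapter_url)
else:
    print('Already crawled, skip:', chapter_url)

print('Total crawl time:', time.time() - start_time)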
3. Code
(1) xbiquge/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os
import time
from twisted.enterprise import adbapi
from pymongo import MongoClient

class XbiqugePipeline(object):
    conn = MongoClient('mongodb://admin:admin@localhost:27017/admin')
    db = conn.novels                 # connection object for the "novels" database
    name_novel = ''
    url_firstchapter = ''
    name_txt = ''
    start_time = time.time()         # start time of the whole crawl

    # class initialization
    def __init__(self):
        return

    # called when the spider starts
    def open_spider(self, spider):
        return

    # return the collection (cursor) object for the given collection name
    def get_collection(self, name_collection):
        myset = self.db[name_collection]
        return myset

    def process_item(self, item, spider):
        # if self.name_novel == '':
        self.name_novel = item['name']
        self.url_firstchapter = item['url_firstchapter']
        self.name_txt = item['name_txt']
        myset = self.db[self.name_novel]
        myset.insert_one(dict(item))
        # if self.name_novel != '':
        #     exec('self.db.' + self.name_novel + '.insert_one(dict(item))')
        return item

    # read the chapter contents from the database and write them to a txt file
    def content2txt(self, dbname, firsturl, txtname):
        myset = self.db[dbname]
        record_num = myset.find().count()        # number of chapters stored
        print("Total number of chapters:", record_num)
        counts = record_num
        url_c = firsturl
        start_time = time.time()                 # start time of the txt-generation step
        f = open(txtname + ".txt", mode='w', encoding='utf-8')   # open "<novel name>.txt" for writing
        for i in range(counts):
            # ---------- alternative: use the integer returned by count() to decide whether data was found ----------
            # record_m_count = myset.find({"url": url_c}, {"content": 1, "_id": 0}).count()
            # if record_m_count == 0:
            #     print("Chapter content not found in the collection.\nFailing url:", url_c)
            #     break
            # ---------------------------------------------------------------------------------------------------------
            # ---------- read the cursor with next() and catch the "no data" case with try/except ----------
            try:
                record_m = myset.find({"url": url_c}, {"content": 1, "_id": 0}).next()
            # except Exception as e:
            except StopIteration:
                print("Chapter content not found in the collection.\nFailing url:", url_c)
                break                            # leave the for loop and stop generating the txt file
            # ---------------------------------------------------------------------------------------------------------
            record_content_c2a0 = ''
            # ---------- alternative: read the cursor with a for loop ----------
            # record_i = myset.find({"url": url_c}, {"content": 1, "_id": 0})
            # for record_m in record_i:
            #     record_content_c2a0 = record_m["content"]    # chapter content
            # -------------------------------------------------------------------
            record_content_c2a0 = record_m["content"]
            # record_content = record_content_c2a0.replace(u'\xa0', u'')   # strip the special character \xc2\xa0
            record_content = record_content_c2a0
            # print(record_content)
            f.write('\n')
            f.write(record_content + '\n')
            f.write('\n\n')
            url_ct = myset.find({"url": url_c}, {"next_page": 1, "_id": 0})   # query for the next chapter's link
            for item_url in url_ct:
                url_c = item_url["next_page"]    # assign the next chapter's url to url_c for the next iteration
                # print("next page", url_c)
        f.close()
        print("Time to generate the file:", time.time() - start_time)
        print("Total crawl time:", time.time() - self.start_time)
        print(txtname + ".txt" + " has been generated!")
        return

    # when the spider closes, call content2txt to generate the txt file
    def close_spider(self, spider):
        if self.name_novel != '' and self.url_firstchapter != '' and self.name_txt != '':
            self.content2txt(self.name_novel, self.url_firstchapter, self.name_txt)
        return
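The pipeline only takes effect if it is registered in the project's settings, as the comment at the top of the file notes. The original post does not show settings.py; a minimal fragment consistent with that comment might look like this (the priority value 300 is only an illustrative choice):

# xbiquge/settings.py (fragment, not shown in the original post)
ITEM_PIPELINES = {
    'xbiquge.pipelines.XbiqugePipeline': 300,   # register XbiqugePipeline; 300 is an arbitrary priority
}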
(2) Example spider code: xbiquge/spiders/sancun.py
# -*- coding: utf-8 -*-
import scrapy
from xbiquge.items import XbiqugeItem
from xbiquge.pipelines import XbiqugePipeline
import pdb

class SancunSpider(scrapy.Spider):
    name = 'sancun'
    allowed_domains = ['www.xbiquge.la']
    # start_urls = ['https://www.xbiquge.la/10/10489/']
    url_ori = "https://www.xbiquge.la"
    url_firstchapter = "https://www.xbiquge.la/10/10489/4534454.html"
    name_txt = "./novels/三寸人间"
    index_FS = url_firstchapter.rfind('/')            # position of the first slash, searching from the right
    # url_chapters = url_firstchapter[0:32]
    url_chapters = url_firstchapter[0:index_FS + 1]   # table-of-contents page url, including the trailing slash

    pipeline = XbiqugePipeline()
    # get the novel's collection (cursor) object; a MongoDB collection corresponds to a MySQL table
    novelcollection = pipeline.get_collection(name)

    # ----------------------------------------------------------------------------------------------
    # If a record's next_page value is the table-of-contents url, delete that record; otherwise a
    # later crawl would find several table-of-contents urls and fail to pick up the latest chapters.
    if novelcollection.find({"next_page": url_chapters}).count() != 0:
        print("Record containing the table-of-contents url:",
              novelcollection.find({"next_page": url_chapters}, {"_id": 0, "id": 1, "url": 1, "next_page": 1}).next())
        # pdb.set_trace()
        novelcollection.remove({"next_page": url_chapters})
        print("Record containing the table-of-contents url has been deleted.")
    # ----------------------------------------------------------------------------------------------

    novelcounts = novelcollection.find().count()
    novelurls = novelcollection.find({}, {"_id": 0, "id": 1, "url": 1})

    item = XbiqugeItem()
    item['id'] = novelcounts              # initialize id to the number of records already in the collection
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt

    def start_requests(self):
        start_urls = [self.url_chapters]
        print("Novel table-of-contents url:", start_urls)
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    # extract chapter links from the page and compare them with the MongoDB collection;
    # only urls that are not already stored are crawled.
    def parse(self, response):
        f = open("/root/xbiquge_w/url_list.txt", "w")   # file for logging the urls that get crawled
        count_bingo = 0                                  # number of chapters already present in the collection
        dl = response.css('#list dl dd')                 # chapter link elements
        for dd in dl:
            count_iterator = 0
            self.url_c = self.url_ori + dd.css('a::attr(href)').extract()[0]   # build each chapter's full url
            # print("url extracted from the page:", self.url_c)
            # re-assign the cursor to reset its pointer so the for loop can traverse it from the beginning
            self.novelurls = self.novelcollection.find({}, {"_id": 0, "id": 1, "url": 1})
            for url in self.novelurls:
                # print("url from mongodb:", url)
                if url["url"] == self.url_c:             # the url is already in the collection, leave the loop
                    count_bingo += 1
                    count_iterator += 1
                    break
            if count_iterator != 0:                      # already stored, skip to the next chapter without crawling
                continue
            # print("crawling url:", self.url_c)
            f.write("crawling url: " + self.url_c + "\n")
            # yield scrapy.Request(self.url_c, callback=self.parse_c, dont_filter=True)
            # call parse_c as a generator (yield) to get the chapter url, previous/next page links and chapter content
            yield scrapy.Request(self.url_c, callback=self.parse_c)
            # print(self.url_c)
        f.close()
        print("Number of records already in the collection (count_bingo):", count_bingo)

    def parse_c(self, response):
        self.item['id'] += 1
        self.item['url'] = response.url
        self.item['preview_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[1]
        self.item['next_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[3]
        title = response.css('.con_top::text').extract()[4]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        # print(text)
        # combine the chapter title and body into content; \15 is octal for ^M and is replaced with a newline
        self.item['content'] = title + "\n" + text.replace('\15', '\n')
        yield self.item          # output the Item to the pipeline as a generator (yield)
        # handle a chapter split across several pages: the next_page url shares the current url's chapter number
        if self.item['url'][self.url_firstchapter.rfind('/') + 1:self.url_firstchapter.rfind('.')] == \
                self.item['next_page'][self.url_firstchapter.rfind('/') + 1:self.url_firstchapter.rfind('.')]:
            self.url_c = self.item['next_page']
            yield scrapy.Request(self.url_c, callback=self.parse_c)
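Both files reference the fields of XbiqugeItem, but xbiquge/items.py is not shown in the post. A definition that simply declares the fields actually used above would look like the sketch below; the crawl itself is then started in the usual Scrapy way with: scrapy crawl sancun. Note also that the cursor count() and collection remove() calls used above are PyMongo 3.x APIs that were removed in PyMongo 4.x, so the code assumes PyMongo 3.

# xbiquge/items.py -- a sketch that only declares the fields used by the spider and pipeline above
import scrapy

class XbiqugeItem(scrapy.Item):
    id = scrapy.Field()                # sequential chapter number
    name = scrapy.Field()              # novel name, also used as the MongoDB collection name
    url_firstchapter = scrapy.Field()  # url of the first chapter
    name_txt = scrapy.Field()          # output txt file path (without the .txt extension)
    url = scrapy.Field()               # url of the current chapter page
    preview_page = scrapy.Field()      # url of the previous page
    next_page = scrapy.Field()         # url of the next page
    content = scrapy.Field()           # chapter title plus body text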