Scrapy Basic Usage (Part 2)
The previous post crawled chapter titles and chapter URLs; this one also grabs the chapter content of the novel, although the chapters do not come back in order.
Main spider logic: ./biquge/biquge/spiders/Scrn.py
# -*- coding: utf-8 -*-
import scrapy
import re
from biquge.items import UrlItem, FieldItem   # Item classes that define the stored fields


class ScrnSpider(scrapy.Spider):
    name = 'Scrn'   # spider name; run it with: scrapy crawl <name>
    allowed_domains = ['www.xbiquge.la']   # domains the spider is allowed to visit
    start_urls = ['http://www.xbiquge.la/10/10489/']   # start URL, normally the page you want to crawl

    def parse(self, response):   # default parse callback
        for each in response.xpath("//div[@id='list']/dl/dd"):   # extract the needed fields with XPath
            item = UrlItem()
            item['title'] = each.xpath("./a/text()").extract_first()   # store the extracted field in the item
            item['url'] = each.xpath("./a/@href").extract_first()
            yield item   # hand the item to the pipelines (pipelines.py); they must be enabled in settings.py


class Scrn1Spider(scrapy.Spider):
    name = "Scrn1"
    allowed_domains = ['www.xbiquge.la']   # must not include http://, or it becomes a URL instead of a domain
    # allowed_domains = ["http://www.xbiquge.la"]
    start_urls = ['http://www.xbiquge.la/10/10489/']   # start URL, normally the page you want to crawl

    def parse(self, response):
        for url in response.xpath("//div[@id='list']/dl/dd/a/@href").extract():
            # scrapy.Request(url, callback): request url; the response is handled by the callback
            yield scrapy.Request("http://www.xbiquge.la" + url, callback=self.parse1)

    def parse1(self, response):
        item = FieldItem()
        # never write item.title -- scrapy.Item only allows dict-style access, and the failure is hard to trace
        # item.title = response.xpath("//div[@class='box_con']/div[2]/h1/text()").extract()[0]
        title = response.xpath("//div[@class='box_con']/div[2]/h1/text()").extract()[0]
        # item['title'] = "-".join(re.split(r"\s+", title.strip()))   # equivalent split/join version
        item['title'] = re.sub(r"\s+", "-", title.strip())   # replace whitespace in the title with "-"
        content_undeal = response.xpath("//div[@class='box_con']/div[@id='content']/text()").extract()
        item['content'] = ""
        for i in content_undeal:
            i = i.replace("\xa0", "")    # str.replace returns a new string, so reassign it
            i = i.replace("\r", "\n")
            item['content'] += i
        return item
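The chapters come back out of order because Scrapy sends the requests for all chapter URLs concurrently. A minimal sketch of one possible fix, not part of the original code: pass each chapter's index along with the request through Request.meta, then prefix it to the title in parse1 so the per-chapter files sort in reading order (the 'index' key and the zero padding are my own choices):

# inside Scrn1Spider, replacing the parse method shown above
def parse(self, response):
    urls = response.xpath("//div[@id='list']/dl/dd/a/@href").extract()
    for index, url in enumerate(urls):
        # meta travels with the request and comes back on response.meta
        yield scrapy.Request("http://www.xbiquge.la" + url,
                             callback=self.parse1,
                             meta={'index': index})

In parse1 the title line would then become item['title'] = "%04d-" % response.meta['index'] + re.sub(r"\s+", "-", title.strip()).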
Field definitions file: ./biquge/biquge/items.py
import scrapy


class UrlItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()


class FieldItem(scrapy.Item):
    # "//div[@class='box_con']/div[2]/h1/text()"
    title = scrapy.Field()
    # "//div[@class='box_con']/div[@id='content']/text()"
    content = scrapy.Field()
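As the comment in parse1 warns, scrapy.Item objects are accessed like dictionaries, not through attributes, and only declared fields are accepted. A quick sanity check (the values are placeholders, not real data):

from biquge.items import FieldItem

item = FieldItem()
item['title'] = 'chapter-1'       # dict-style assignment works for declared fields
item['content'] = 'some text'
# item.title = 'chapter-1'        # raises AttributeError: use item['title'] = ... instead
# item['author'] = 'x'            # raises KeyError: 'author' was never declared as a Field
print(dict(item))                 # {'title': 'chapter-1', 'content': 'some text'}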
Pipeline (storage) logic: ./biquge/biquge/pipelines.py
import json


class UrlPipeline(object):
    def __init__(self):
        self.filename = open("Url.json", "wb")   # open the output file when the pipeline is created

    def process_item(self, item, spider):   # receives and processes every item the spider yields
        # convert the item to a dict, then to JSON; ensure_ascii=False keeps Chinese characters readable
        jsontext = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(jsontext.encode("utf-8"))   # write the JSON line to the file
        return item

    def close_spider(self, spider):   # runs when the spider closes
        self.filename.close()   # close the file


# Originally everything was going into one file, but the chapters arrive out of order,
# so each chapter is now saved as its own file inside a directory.
# class FieldPipeline(object):
#     def __init__(self):
#         self.filename = open("abc.txt", "wb")
#     def process_item(self, item, spider):
#         text = "\n" + item['title'] + "\n" + item['content']
#         self.filename.write(text.encode("utf-8"))
#         return item
#     def close_spider(self, spider):
#         self.filename.close()

class FieldPipeline(object):
    def process_item(self, item, spider):
        with open("./file/" + item['title'], "wb") as f:   # the ./file/ directory must already exist
            f.write(item['content'].encode("utf-8"))
        return item
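Note that ITEM_PIPELINES applies to every spider in the project, so both pipelines receive both kinds of item: FieldPipeline would raise a KeyError on 'content' while Scrn is running, and UrlPipeline would also dump chapter text into Url.json while Scrn1 is running. Below is a sketch of one way to keep them apart and to make sure ./file/ exists; the isinstance checks, the open_spider hook and the os.makedirs call are my additions, not part of the original pipelines:

import json
import os

from biquge.items import UrlItem, FieldItem


class UrlPipeline(object):
    def __init__(self):
        self.filename = open("Url.json", "wb")

    def process_item(self, item, spider):
        if isinstance(item, UrlItem):   # only handle the chapter-list items produced by Scrn
            jsontext = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.filename.write(jsontext.encode("utf-8"))
        return item

    def close_spider(self, spider):
        self.filename.close()


class FieldPipeline(object):
    def open_spider(self, spider):
        os.makedirs("./file", exist_ok=True)   # create the output directory if it is missing

    def process_item(self, item, spider):
        if isinstance(item, FieldItem):   # only handle the chapter-content items produced by Scrn1
            with open("./file/" + item['title'], "wb") as f:
                f.write(item['content'].encode("utf-8"))
        return item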
Enable the pipelines in ./biquge/biquge/settings.py
ITEM_PIPELINES = {
    'biquge.pipelines.UrlPipeline': 300,
    'biquge.pipelines.FieldPipeline': 200,
}
When you run a crawl, the spider name decides which logic executes and which Item class gets filled, while the pipelines enabled in settings.py decide how the items are stored. The smaller a pipeline's number, the higher its priority, i.e. the earlier it is called.
scrapy crawl Scrn    runs Scrn (collects chapter titles and chapter URLs)
scrapy crawl Scrn1   runs Scrn1 (downloads the novel chapters)
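Both spiders can also be launched from one Python script instead of two crawl commands. A sketch using Scrapy's CrawlerProcess, assuming the script sits next to scrapy.cfg so get_project_settings() finds the project settings; note the two crawls share one reactor, so Scrn is not guaranteed to finish before Scrn1 starts:

# run_spiders.py (hypothetical helper script, placed in the project root)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())   # loads settings.py, including ITEM_PIPELINES
process.crawl("Scrn")    # chapter titles and URLs -> Url.json
process.crawl("Scrn1")   # chapter content -> ./file/
process.start()          # blocks until both crawls finish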