Scrapy Basic Usage (Part 2)
The previous post crawled chapter titles and chapter URLs; this one also grabs the chapter content of the novel, although the chapters do not come back in order.
Main spider logic: ./biquge/biquge/spiders/Scrn.py
# -*- coding: utf-8 -*-
import scrapy
import re
from biquge.items import UrlItem, FieldItem   # Item classes that define the stored fields


class ScrnSpider(scrapy.Spider):
    name = 'Scrn'   # spider name; run it with: scrapy crawl <name>
    allowed_domains = ['www.xbiquge.la']   # domains the spider is allowed to visit
    start_urls = ['http://www.xbiquge.la/10/10489/']   # start URL, normally the page you want to crawl

    def parse(self, response):   # default parse callback
        for each in response.xpath("//div[@id='list']/dl/dd"):   # extract the needed fields with XPath
            item = UrlItem()
            item['title'] = each.xpath("./a/text()").extract_first()   # store the extracted field in the item
            item['url'] = each.xpath("./a/@href").extract_first()
            yield item   # hand the item to the pipelines (pipelines.py); they must be enabled in settings.py


class Scrn1Spider(scrapy.Spider):
    name = "Scrn1"
    allowed_domains = ['www.xbiquge.la']   # must not include http://, or it becomes a URL instead of a domain
    # allowed_domains = ["http://www.xbiquge.la"]
    start_urls = ['http://www.xbiquge.la/10/10489/']   # start URL, normally the page you want to crawl

    def parse(self, response):
        for url in response.xpath("//div[@id='list']/dl/dd/a/@href").extract():
            # scrapy.Request(url, callback): request url; the response is handled by the callback
            yield scrapy.Request("http://www.xbiquge.la" + url, callback=self.parse1)

    def parse1(self, response):
        item = FieldItem()
        # never write item.title -- scrapy.Item only allows dict-style access, and the failure is hard to trace
        # item.title = response.xpath("//div[@class='box_con']/div[2]/h1/text()").extract()[0]
        title = response.xpath("//div[@class='box_con']/div[2]/h1/text()").extract()[0]
        # item['title'] = "-".join(re.split(r"\s+", title.strip()))   # equivalent split/join version
        item['title'] = re.sub(r"\s+", "-", title.strip())   # replace whitespace in the title with "-"
        content_undeal = response.xpath("//div[@class='box_con']/div[@id='content']/text()").extract()
        item['content'] = ""
        for i in content_undeal:
            i = i.replace("\xa0", "")    # str.replace returns a new string, so reassign it
            i = i.replace("\r", "\n")
            item['content'] += i
        return item
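The chapters come back out of order because Scrapy sends the requests for all chapter URLs concurrently. A minimal sketch of one possible fix, not part of the original code: pass each chapter's index along with the request through Request.meta, then prefix it to the title in parse1 so the per-chapter files sort in reading order (the 'index' key and the zero padding are my own choices):

# inside Scrn1Spider, replacing the parse method shown above
def parse(self, response):
    urls = response.xpath("//div[@id='list']/dl/dd/a/@href").extract()
    for index, url in enumerate(urls):
        # meta travels with the request and comes back on response.meta
        yield scrapy.Request("http://www.xbiquge.la" + url,
                             callback=self.parse1,
                             meta={'index': index})

In parse1 the title line would then become item['title'] = "%04d-" % response.meta['index'] + re.sub(r"\s+", "-", title.strip()).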
Field definitions file: ./biquge/biquge/items.py
import scrapy


class UrlItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()


class FieldItem(scrapy.Item):
    # "//div[@class='box_con']/div[2]/h1/text()"
    title = scrapy.Field()
    # "//div[@class='box_con']/div[@id='content']/text()"
    content = scrapy.Field()
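As the comment in parse1 warns, scrapy.Item objects are accessed like dictionaries, not through attributes, and only declared fields are accepted. A quick sanity check (the values are placeholders, not real data):

from biquge.items import FieldItem

item = FieldItem()
item['title'] = 'chapter-1'       # dict-style assignment works for declared fields
item['content'] = 'some text'
# item.title = 'chapter-1'        # raises AttributeError: use item['title'] = ... instead
# item['author'] = 'x'            # raises KeyError: 'author' was never declared as a Field
print(dict(item))                 # {'title': 'chapter-1', 'content': 'some text'}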
Pipeline (storage) logic: ./biquge/biquge/pipelines.py
import json


class UrlPipeline(object):
    def __init__(self):
        self.filename = open("Url.json", "wb")   # open the output file when the pipeline is created

    def process_item(self, item, spider):   # receives and processes every item the spider yields
        # convert the item to a dict, then to JSON; ensure_ascii=False keeps Chinese characters readable
        jsontext = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(jsontext.encode("utf-8"))   # write the JSON line to the file
        return item

    def close_spider(self, spider):   # runs when the spider closes
        self.filename.close()   # close the file


# Originally everything was going into one file, but the chapters arrive out of order,
# so each chapter is now saved as its own file inside a directory.
# class FieldPipeline(object):
#     def __init__(self):
#         self.filename = open("abc.txt", "wb")
#     def process_item(self, item, spider):
#         text = "\n" + item['title'] + "\n" + item['content']
#         self.filename.write(text.encode("utf-8"))
#         return item
#     def close_spider(self, spider):
#         self.filename.close()

class FieldPipeline(object):
    def process_item(self, item, spider):
        with open("./file/" + item['title'], "wb") as f:   # the ./file/ directory must already exist
            f.write(item['content'].encode("utf-8"))
        return item
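Note that ITEM_PIPELINES applies to every spider in the project, so both pipelines receive both kinds of item: FieldPipeline would raise a KeyError on 'content' while Scrn is running, and UrlPipeline would also dump chapter text into Url.json while Scrn1 is running. Below is a sketch of one way to keep them apart and to make sure ./file/ exists; the isinstance checks, the open_spider hook and the os.makedirs call are my additions, not part of the original pipelines:

import json
import os

from biquge.items import UrlItem, FieldItem


class UrlPipeline(object):
    def __init__(self):
        self.filename = open("Url.json", "wb")

    def process_item(self, item, spider):
        if isinstance(item, UrlItem):   # only handle the chapter-list items produced by Scrn
            jsontext = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.filename.write(jsontext.encode("utf-8"))
        return item

    def close_spider(self, spider):
        self.filename.close()


class FieldPipeline(object):
    def open_spider(self, spider):
        os.makedirs("./file", exist_ok=True)   # create the output directory if it is missing

    def process_item(self, item, spider):
        if isinstance(item, FieldItem):   # only handle the chapter-content items produced by Scrn1
            with open("./file/" + item['title'], "wb") as f:
                f.write(item['content'].encode("utf-8"))
        return item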
Enable the pipelines in ./biquge/biquge/settings.py
ITEM_PIPELINES = {
    'biquge.pipelines.UrlPipeline': 300,
    'biquge.pipelines.FieldPipeline': 200,
}
When you run a crawl, the spider name decides which logic executes and which Item class gets filled, while the pipelines enabled in settings.py decide how the items are stored. The smaller a pipeline's number, the higher its priority, i.e. the earlier it is called.
scrapy crawl Scrn    runs Scrn (collects chapter titles and chapter URLs)
scrapy crawl Scrn1   runs Scrn1 (downloads the novel chapters)
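Both spiders can also be launched from one Python script instead of two crawl commands. A sketch using Scrapy's CrawlerProcess, assuming the script sits next to scrapy.cfg so get_project_settings() finds the project settings; note the two crawls share one reactor, so Scrn is not guaranteed to finish before Scrn1 starts:

# run_spiders.py (hypothetical helper script, placed in the project root)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())   # loads settings.py, including ITEM_PIPELINES
process.crawl("Scrn")    # chapter titles and URLs -> Url.json
process.crawl("Scrn1")   # chapter content -> ./file/
process.start()          # blocks until both crawls finish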