Python crawler (16): crawling the Dingdian novel site with the Scrapy framework

This post uses the Scrapy framework to crawl the novels on the entire Dingdian novel site (23us.com).


1. Installing Scrapy

There are plenty of installation guides online, so I won't repeat them here.
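
For reference, in most environments a plain pip install is enough:

pip install scrapy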

2. About Scrapy

Scrapy is an excellent framework: it crawls asynchronously, which saves a great deal of time. The site could also be scraped with the plain sequential approach from the earlier posts in this series, but that felt far too slow given the amount of data involved.

For an introduction to the framework itself, there are also plenty of examples online.
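
Most of that asynchronous speed comes down to a handful of settings. A minimal sketch, with purely illustrative values rather than what this project uses:

# settings.py -- illustrative values only; tune them and stay polite to the site
CONCURRENT_REQUESTS = 16              # requests kept in flight at once
CONCURRENT_REQUESTS_PER_DOMAIN = 8    # cap per domain
DOWNLOAD_DELAY = 0.25                 # small delay between requests to the same site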

3. The implementation


Create the project with:

scrapy startproject dingdian


Then add the spider file; the final layout of the code looks like this:

├── dingdian
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── mydingdian.py
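
mydingdian.py was added by hand. Scrapy can also generate a spider skeleton for you; the generated spider name can then be changed to match the code below:

scrapy genspider mydingdian 23us.com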


The main program:

mydingdian.py

#coding:utf-8
import scrapy
import re
from scrapy.http import Request
from dingdian.items import DingdianItem
#from dingdian.items import DDNovelContentItem

class Myspider(scrapy.Spider):
	name = "dingdian"
	allowed_domains = ["23us.com"]
	bash_url = "http://www.23us.com/class/"
	bashurl='.html'
	
	def start_requests(self):
		#for i in range(1,11):
		for i in range(7,8): # only category 7 here; the commented-out line above walks categories 1-10
			url=self.bash_url+str(i)+"_1"+self.bashurl
			yield Request(url,self.parse)
	
	def parse(self, response):
		
		baseurl=response.url  # the URL here looks like http://www.23us.com/class/*_1.html
		
		max_num=response.xpath('//*[@id="pagelink"]/a[14]/text()').extract_first()# largest page number in this category
		print max_num
		baseurl=baseurl[:-7]
				
		#for num in xrange(1,int(max_num)+1):
		for num in xrange(1,3): # only the first two pages here; the commented-out line above walks every page up to max_num
			newurl=baseurl+"_"+str(num)+self.bashurl
			# dont_filter matters here: with it the first listing page is fetched, without it it gets dropped,
			# because Scrapy deduplicates request URLs (RFPDupeFilter) and dont_filter exempts this URL from that check.
			yield Request(newurl,dont_filter=True,callback=self.get_name)# hand each listing page to get_name for processing
	
	def get_name(self,response):
		for nameinfo in response.xpath('//tr'):
			novelurl = nameinfo.xpath('td[1]/a/@href').extract_first()# novel URL
			name = nameinfo.xpath('td[1]/a/text()').extract_first()# novel name
			if  novelurl:
				yield Request(novelurl,dont_filter=True,callback=self.get_novelcontent,meta={'name':name})
			'''
			# Alternative: pull the novel details from the listing page itself
			#print nameinfo
			name = nameinfo.xpath('td[1]/a/text()').extract_first()# novel name
			author= nameinfo.xpath('td[3]/text()').extract_first()# author
			novelurl = nameinfo.xpath('td[1]/a/@href').extract_first()# novel URL
			serialstatus = nameinfo.xpath('td[6]/text()').extract_first()# serialisation status
			serialnumber = nameinfo.xpath('td[4]/text()').extract_first()# word count
			if  novelurl:
				targentcontent['novel_name']=name
				targentcontent['author']=author
				targentcontent['novelurl']=novelurl
				targentcontent['serialstatus']=serialstatus
				targentcontent['serialnumber']=serialnumber
				#print name,author,novelurl,serialstatus,serialnumber

				yield Request(novelurl,callback=self.get_novelcontent,meta={'targentcontent':targentcontent})
			# the novel details are not passed along here for now
			'''
	
	def get_novelcontent(self,response):
		#targentcontent=response.meta['targentcontent']
		#print targentcontent['novelurl'],targentcontent['name']
		#title = response.xpath('//dd[1]/h1/text()').extract_first()
		novel_name=response.meta['name']# novel name
		author = response.xpath('//tr[1]/td[2]/text()').extract_first()# author
		novelurl = response.url# novel URL
		serialstatus = response.xpath('//tr[1]/td[3]/text()').extract_first()# serialisation status
		serialnumber = response.xpath('//tr[2]/td[2]/text()').extract_first()# word count
		category = response.xpath('//tr[1]/td[1]/a/text()').extract_first()# category
		name_id = novelurl[-5:]# novel id (last digits of the URL)
		collect_num_total=response.xpath('//tr[2]/td[1]/text()').extract_first()# total bookmarks
		click_num_total=response.xpath('//tr[3]/td[1]/text()').extract_first()# total clicks
		
		#chapterlistul=response.xpath('//dd[2]/div[2]/p[2]/a/text()').extract_first()
		chapterlisturl=response.xpath('//dd[2]/div[2]/p[2]/a/@href').extract_first()
		novel_breif=response.xpath('//dd[2]/p[2]').extract_first()
		
		targentcontent=DingdianItem()
		targentcontent['novel_name']=novel_name
		targentcontent['author']=author
		targentcontent['novelurl']=novelurl
		targentcontent['serialstatus']=serialstatus
		targentcontent['serialnumber']=serialnumber	
		targentcontent['category']=category	
		targentcontent['name_id']=name_id	
		targentcontent['collect_num_total']=collect_num_total	
		targentcontent['click_num_total']=click_num_total	
		targentcontent['novel_breif']=novel_breif	
		#yield targentcontent
		#print novel_name,author,novelurl,serialstatus,serialnumber,category,name_id,collect_num_total,click_num_total,chapterlisturl
		yield Request(chapterlisturl,dont_filter=True,callback=self.get_charaterurl,meta={'targentcontent':targentcontent})
		
	def get_charaterurl(self,response):
		#print response.url
		item=response.meta['targentcontent']
		for contents in response.xpath('//table/tr'):
			for content in contents.xpath('td'):
				if  content.xpath('a/text()').extract_first():
					#print content.xpath('a/text()').extract_first()
					item['chapterurl']=response.url+content.xpath('a/@href').extract_first()
					item['chaptername']=content.xpath('a/text()').extract_first()
					yield item
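
If any of the XPath expressions above stop matching (the site layout does change now and then), scrapy shell is a quick way to test them against a live page, e.g. against one of the category listing pages used above:

scrapy shell "http://www.23us.com/class/7_1.html"
>>> response.xpath('//*[@id="pagelink"]/a[14]/text()').extract_first()
>>> response.xpath('//tr/td[1]/a/@href').extract()[:5]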
		
The stored fields are defined in items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DingdianItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
	novel_name = scrapy.Field()# novel name
	author = scrapy.Field()# author
	novelurl = scrapy.Field()# novel URL
	serialstatus = scrapy.Field()# serialisation status
	serialnumber = scrapy.Field()# word count
	category = scrapy.Field()# category
	name_id = scrapy.Field()# novel id
	collect_num_total=scrapy.Field()# total bookmarks
	click_num_total=scrapy.Field()# total clicks
	novel_breif=scrapy.Field()# novel synopsis
	chapterurl = scrapy.Field()# chapter URL
	chaptername = scrapy.Field()# chapter name

The settings, settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for dingdian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dingdian'

SPIDER_MODULES = ['dingdian.spiders']
NEWSPIDER_MODULE = 'dingdian.spiders'


PAGE_STORGE="novels"  # root directory for the saved novel files; custom setting read by the pipeline

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dingdian (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dingdian.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'dingdian.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'dingdian.pipelines.DingdianPipeline': 100,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
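
PAGE_STORGE above is a custom setting; the pipeline below reads it by importing the settings module directly. That works, but a slightly more idiomatic sketch (assuming nothing else in the pipeline changes) is to pull it from the crawler settings:

# sketch only: read the custom PAGE_STORGE setting via from_crawler
class DingdianPipeline(object):
    def __init__(self, page_storge):
        self.page_storge = page_storge  # replaces the settings.PAGE_STORGE import

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.get('PAGE_STORGE', 'novels'))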
Finally, the data processing and saving:

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from dingdian import settings
import os
import urllib2
from dingdian.items import DingdianItem
#from dingdian.items import DDNovelContentItem

from bs4 import BeautifulSoup as bs
import sys

reload(sys)
sys.setdefaultencoding('utf-8')


class DingdianPipeline(object):
    def process_item(self, item, spider):
		
		dir_path="%s/%s" % (settings.PAGE_STORGE,spider.name)

		if not os.path.exists(dir_path):
		#	print "dir_path is %s",dir_path
			os.makedirs(dir_path)
		if isinstance(item,DingdianItem):
			novelpath=dir_path+'/'+item['novel_name']
			print novelpath
			if not os.path.exists(novelpath):
				os.makedirs(novelpath)
			novelbreif=item['novel_name']+"_简介"  # the synopsis file is saved as "<novel name>_简介.txt"
			novelbreifpath=novelpath+'/'+novelbreif+'.txt'
			if not os.path.exists(novelbreifpath):
				with open(novelbreifpath,'wb') as novel_write:
					novel_write.write(item['novel_name'])
					novel_write.write('\t|\t')
					novel_write.write(item['author'])
					novel_write.write('\t|\t')
					novel_write.write(item['novelurl'])
					novel_write.write('\n')
					novel_write.write(item['serialstatus'])
					novel_write.write('\t|\t')
					novel_write.write(item['serialnumber'])
					novel_write.write('\t|\t')
					novel_write.write(item['category'])
					novel_write.write('\n')
					novel_write.write(item['name_id'])
					novel_write.write('\t|\t')
					novel_write.write(item['collect_num_total'])
					novel_write.write('\t|\t')
					novel_write.write(item['click_num_total'])
					novel_write.write('\n')
					novel_write.write(item['novel_breif'])
					
			titlename=item['chaptername']
			titlenamepath=novelpath+'/'+titlename+'.txt'
			print titlenamepath
			chapterurl=item['chapterurl']
			html=urllib2.urlopen(chapterurl).read()
			soup1=bs(html,'lxml')
			if not os.path.exists(titlenamepath):
				with open(titlenamepath,'wb') as file_write:
					cont=soup1.find("dd",attrs={"id":"contents"}).getText()
					#print cont
					file_write.write(cont)
					file_write.close()		
					
		

		
		return item
		
		# Passing -o books.csv on the command line exports the scraped items to a CSV file.
		# Besides CSV, Scrapy can also export JSON and XML.
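
One thing worth flagging about the pipeline: it downloads every chapter with urllib2, which is a blocking call outside Scrapy's scheduler, so the crawl loses most of its asynchrony at the most request-heavy step. A sketch of an alternative, where the spider yields the chapter requests itself and the pipeline only writes files (get_chaptercontent is a hypothetical extra callback, and items.py would need a content field added):

	# in mydingdian.py -- sketch only, not the code used above
	def get_charaterurl(self, response):
		item = response.meta['targentcontent']
		for link in response.xpath('//table/tr/td/a'):
			chapterurl = response.url + link.xpath('@href').extract_first()
			chaptername = link.xpath('text()').extract_first()
			yield Request(chapterurl, callback=self.get_chaptercontent,
				meta={'targentcontent': item, 'chaptername': chaptername})

	def get_chaptercontent(self, response):
		item = dict(response.meta['targentcontent'])  # plain dict copy of the novel metadata
		item['chaptername'] = response.meta['chaptername']
		item['chapterurl'] = response.url
		item['content'] = '\n'.join(response.xpath('//dd[@id="contents"]//text()').extract())
		yield item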



Then run:

scrapy crawl dingdian

If there are no errors, wait a few hours and you'll find plenty of novels sitting on your machine.
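
As the comment in the pipeline notes, the built-in feed exporters can also dump the scraped items directly, which is handy for a quick check of the metadata:

scrapy crawl dingdian -o books.csv
scrapy crawl dingdian -o books.json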
