Crawling Jianshu articles with Scrapy
For what the various entries in settings.py mean, see this earlier post:
Using Python's Scrapy framework and XPath && request/response parameters in Scrapy && how the parse() function works
Directory structure
Create a Scrapy project (the trailing js is the name of the project you are creating):
scrapy startproject js
Create a spider based on the crawl template:
scrapy genspider -t crawl jianshu www.jianshu.com
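After running the two commands, the directory structure referred to above looks roughly like this (start.py is the launcher script we add by hand in step 三):

js/
├── scrapy.cfg
├── start.py
└── js/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── jianshu.py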
一、Create the spider file jianshu.py from Scrapy's crawl template
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from js.items import JsItem


class JianshuSpider(CrawlSpider):
    # Name used when running the spider with "scrapy crawl"
    name = 'jianshu'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com']

    rules = (
        # We want the recommended articles on the page, and CrawlSpider follows the links we allow here.
        # Article links on the page look like "/p/<12 letters/digits>"
        Rule(LinkExtractor(allow=r'.*?p/.*?'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        # print(response.text)
        # print('-' * 30)
        passage_id = response.url
        title = response.xpath('//h1[@class="_1RuRku"]/text()').get()
        time = response.xpath('//div[@class="s-dsoj"]/time/text()').get()
        author = response.xpath('//span[@class="FxYr8x"]/a/text()').get()
        body = response.xpath('//article[@class="_2rhmJa"]').get()
        type = response.xpath('//div[@class="_2Nttfz"]/a/img/text()').getall()
        type = ','.join(type)
        # For example, if the article URL is https://www.jianshu.com/p/ef7bb28258c8,
        # the article id (passage_id) is ef7bb28258c8. Some links carry extra "?params",
        # so split on "?" first, then take the last path segment as the article id.
        passage_id = passage_id.split('?')[0]
        passage_id = passage_id.split('/')[-1]
        # Some articles break Jianshu's rules and can no longer be viewed even though the
        # link still exists, so check before building the item.
        if author is None:
            pass
        else:
            # As long as the object we return is an item, it is handed to the item argument
            # in pipelines.py no matter which callback yields it.
            item = JsItem(
                passage_id=passage_id,
                title=title,
                time=time,
                author=author,
                body=body,
                type=type
            )
            yield item
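The article id is extracted with plain string splitting; the same logic can be checked on its own like this (the query string below is made up just for illustration):

# quick standalone check of the passage_id logic used in parse_detail
url = 'https://www.jianshu.com/p/ef7bb28258c8?utm_source=example'  # hypothetical query string
passage_id = url.split('?')[0].split('/')[-1]
print(passage_id)  # ef7bb28258c8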
二、items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


# Declare the fields carried by each Item object
class JsItem(scrapy.Item):
    passage_id = scrapy.Field()
    title = scrapy.Field()
    time = scrapy.Field()
    author = scrapy.Field()
    body = scrapy.Field()
    type = scrapy.Field()
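scrapy.Item instances behave like dicts, which is why the pipeline later reads fields with item['title']; a minimal sketch of that behaviour (the values are made up, not scraped data):

from js.items import JsItem

item = JsItem(passage_id='ef7bb28258c8', title='demo title')  # demo values only
print(item['passage_id'])  # fields are read like dict keys
print(dict(item))          # an Item converts cleanly to a plain dict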
三、start.py (launcher script)
from scrapy import cmdline

# "jianshu" is the spider's name; remember to split the command string into a list
cmdline.execute("scrapy crawl jianshu".split())
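If you would rather not go through cmdline, an equivalent launcher can be sketched with Scrapy's CrawlerProcess (assuming it is run from the project root so that the project settings are picked up):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('jianshu')  # the spider's name, same as "scrapy crawl jianshu"
process.start()           # blocks until the crawl finishes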
四、pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from twisted.enterprise import adbapi
import pymysql


# class JsPipeline:
#     def process_item(self, item, spider):
#         return item


class JianShuSpiderPipeline(object):
    def __init__(self):
        # Database connection parameters
        dpparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'your_password',
            'database': 'jianshu',
            'charset': 'utf8'
        }
        # Connect to the database
        self.conn = pymysql.connect(**dpparams)
        # Create a cursor
        self.cursor = self.conn.cursor()
        self._sql = None

    @property  # expose the SQL statement as a property
    def sql(self):
        if self._sql is None:
            self._sql = '''
                insert into jsw(passage_id,title,time,author,body,type)
                values(%s,%s,%s,%s,%s,%s)
            '''
        return self._sql

    def process_item(self, item, spider):
        # Execute the SQL statement with the item's values
        self.cursor.execute(self.sql, (item['passage_id'], item['title'], item['time'],
                                       item['author'], item['body'], item['type']))
        # Remember to commit after executing
        self.conn.commit()
        # Must return the item, otherwise the next item will not be processed
        return item


# Insert into MySQL asynchronously
# class JianShuTwistedPipeline:
#     def __init__(self):
#         dpparams = {
#             'host': '127.0.0.1',
#             'port': 3306,
#             'user': 'root',
#             'password': 'your_password',
#             'database': 'jianshu',
#             'charset': 'utf8'
#         }
#         self.dbpool = adbapi.ConnectionPool('pymysql', **dpparams)
#         self._sql = None
#
#     @property
#     def sql(self):
#         if self._sql is None:
#             self._sql = '''
#                 insert into jsw(passage_id,title,time,author,body,type)
#                 values(%s,%s,%s,%s,%s,%s)
#             '''
#         return self._sql
#
#     def process_item(self, item, spider):
#         # Running Insert through runInteraction makes the insert asynchronous;
#         # calling the function directly would run it synchronously.
#         # runInteraction passes a cursor to Insert.
#         defer = self.dbpool.runInteraction(self.Insert, item)
#         defer.addErrback(self.error, item, spider)
#
#     def Insert(self, cursor, item):
#         cursor.execute(self.sql, (item['passage_id'], item['title'], item['time'],
#                                   item['author'], item['body'], item['type']))
#
#     def error(self, error, item, spider):
#         print('-' * 30)
#         print('error')
#         print('-' * 30)
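The pipeline assumes a jsw table already exists in the jianshu database; the original post does not show its schema, so the column types below are only a guess that matches the insert statement:

import pymysql

# One-off helper: create the jsw table that JianShuSpiderPipeline inserts into.
# The column types are assumptions, not taken from the original post.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='your_password', database='jianshu', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('''
        create table if not exists jsw (
            passage_id varchar(32) primary key,
            title varchar(255),
            time varchar(64),
            author varchar(128),
            body longtext,
            type varchar(255)
        )
    ''')
conn.commit()
conn.close()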
五、settings.py
# Scrapy settings for js project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'js'

SPIDER_MODULES = ['js.spiders']
NEWSPIDER_MODULE = 'js.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'js (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'js.middlewares.JsSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'js.middlewares.JsDownloaderMiddleware': 543,
    # UserAgentMiddleWare implements process_request, so it must be registered here
    # as a downloader middleware, not under SPIDER_MIDDLEWARES.
    'js.middlewares.UserAgentMiddleWare': 543,
    'js.middlewares.SeleniumDownloadMiddleware': 545,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # The pipeline class used to store the scraped data
    'js.pipelines.JianShuSpiderPipeline': 300,
    # 'js.pipelines.JianShuTwistedPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
六、middlewares.py
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
import random
import time

from selenium import webdriver
from scrapy.http.response.html import HtmlResponse


class JsSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class JsDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class UserAgentMiddleWare:
    User_Agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"
    ]

    def process_request(self, request, spider):
        # Pick a random User-Agent for every outgoing request
        user_agent = random.choice(self.User_Agent)
        request.headers['User-Agent'] = user_agent


class SeleniumDownloadMiddleware(object):
    # Use selenium + Chrome to fetch content that the page loads dynamically
    def __init__(self):
        # Pass the location of your chromedriver executable
        self.driver = webdriver.Chrome(executable_path=r'D:\python-senium\chromedriver.exe')

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(2)
        try:
            while True:
                # Below the article there is a category section; keep clicking
                # "show more" until every category the article belongs to is visible.
                # The element has two classes, so locate it with a CSS selector
                # rather than find_element_by_class_name (which cannot take a space).
                showmore = self.driver.find_element_by_css_selector('.anticon.anticon-down')
                showmore.click()
                time.sleep(1)
                if not showmore:
                    break
        except:
            pass
        # Page source as rendered by the browser
        sourse = self.driver.page_source
        # Build a response object and hand it back to jianshu.py
        response = HtmlResponse(url=self.driver.current_url, body=sourse, request=request, encoding='utf-8')
        return response
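One thing SeleniumDownloadMiddleware never does is close the Chrome window when the crawl ends. A possible fix, sketched below, is to hook Scrapy's spider_closed signal; the from_crawler/spider_closed methods are an addition of this write-up, not part of the original middleware:

from scrapy import signals
from selenium import webdriver


class SeleniumDownloadMiddleware(object):
    # process_request stays exactly as in the class above; only the clean-up hook is new.
    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=r'D:\python-senium\chromedriver.exe')

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        # Quit the browser once the spider has finished crawling
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def spider_closed(self, spider):
        self.driver.quit()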